#pragma once
// ============================================================================
// == ./Meta.hpp ==
// ==
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
#include <boost/mp11.hpp>
#if BOOST_MP11_VERSION < 107300
// Copyright 2015 Peter Dimov.
//
// Distributed under the Boost Software License, Version 1.0.
//
// Boost Software License - Version 1.0 - August 17th, 2003
//
// Permission is hereby granted, free of charge, to any person or organization
// obtaining a copy of the software and accompanying documentation covered by
// this license (the "Software") to use, reproduce, display, distribute,
// execute, and transmit the Software, and to prepare derivative works of the
// Software, and to permit third-parties to whom the Software is furnished to
// do so, all subject to the following:
//
// The copyright notices in the Software and this entire statement, including
// the above license grant, this restriction and the following disclaimer,
// must be included in all copies of the Software, in whole or in part, and
// all derivative works of the Software, unless such copies or derivative
// works are solely in the form of machine-executable object code generated by
// a source language processor.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
namespace boost::mp11
{
namespace detail
{
template<class L2>
struct mp_flatten_impl
{
template<class T>
using fn = mp_if<mp_similar<L2, T>, T, mp_list<T>>;
};
} // namespace detail
template<class L, class L2 = mp_clear<L>>
using mp_flatten = mp_apply<mp_append, mp_push_front<mp_transform_q<detail::mp_flatten_impl<L2>, L>, mp_clear<L>>>;
} // namespace boost::mp11
#endif
namespace llama
{
namespace internal
{
template<typename FromList, template<auto...> class ToList>
struct mp_unwrap_values_into_impl;
template<template<class...> class FromList, typename... Values, template<auto...> class ToList>
struct mp_unwrap_values_into_impl<FromList<Values...>, ToList>
{
using type = ToList<Values::value...>;
};
template<typename FromList, template<auto...> class ToList>
using mp_unwrap_values_into = typename mp_unwrap_values_into_impl<FromList, ToList>::type;
} // namespace internal
} // namespace llama
// ==
// == ./Meta.hpp ==
// ============================================================================
// ============================================================================
// == ./macros.hpp ==
// ==
// Copyright 2018 Alexander Matthes
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
#ifdef __INTEL_COMPILER
# error LLAMA has stopped supporting the Intel Classic Compiler after Intel announced its planned deprecation and \
replacement by the Intel LLVM-based compiler. Please migrate to the Intel LLVM-based compiler.
#endif
#if defined(__INTEL_LLVM_COMPILER)
# define LLAMA_INDEPENDENT_DATA _Pragma("ivdep")
#elif defined(__clang__)
# define LLAMA_INDEPENDENT_DATA _Pragma("clang loop vectorize(assume_safety) interleave(assume_safety)")
#elif defined(__GNUC__)
# define LLAMA_INDEPENDENT_DATA _Pragma("GCC ivdep")
#elif defined(_MSC_VER)
# define LLAMA_INDEPENDENT_DATA __pragma(loop(ivdep))
#else
/// May be put in front of a loop statement. Indicates that all (!) data accesses inside the loop are independent, so the
/// loop can be safely vectorized. Example: \code{.cpp}
/// LLAMA_INDEPENDENT_DATA
/// for(int i = 0; i < N; ++i)
/// // because of LLAMA_INDEPENDENT_DATA the compiler knows that a and b
/// // do not overlap and the operation can safely be vectorized
/// a[i] += b[i];
/// \endcode
# define LLAMA_INDEPENDENT_DATA
#endif
#ifndef LLAMA_FORCE_INLINE
# if defined(__NVCC__)
# define LLAMA_FORCE_INLINE __forceinline__
# elif defined(__GNUC__) || defined(__clang__)
# define LLAMA_FORCE_INLINE inline __attribute__((always_inline))
# elif defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER)
# define LLAMA_FORCE_INLINE __forceinline
# else
/// Forces the compiler to inline a function annotated with this macro
# define LLAMA_FORCE_INLINE inline
# warning LLAMA_FORCE_INLINE is only defined to "inline" for this compiler
# endif
#endif
#ifndef LLAMA_PRAGMA
# define LLAMA_PRAGMA(tokens) _Pragma(# tokens)
#endif
#ifndef LLAMA_UNROLL
# if defined(__NVCC__) || defined(__clang__) || defined(__INTEL_LLVM_COMPILER)
# define LLAMA_UNROLL(...) LLAMA_PRAGMA(unroll __VA_ARGS__)
# elif defined(__GNUG__)
# define LLAMA_UNROLL(...) LLAMA_PRAGMA(GCC unroll __VA_ARGS__)
# elif defined(_MSC_VER)
// MSVC does not support a pragma for unrolling
# define LLAMA_UNROLL(...)
# else
/// Requests the compiler to unroll the loop following this directive. An optional unrolling count may be provided as
/// argument, which must be a constant expression.
# define LLAMA_UNROLL(...)
# warning LLAMA_UNROLL is not implemented for your compiler
# endif
#endif
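// Example (illustrative sketch, not part of the original header): LLAMA_UNROLL is placed in front
// of a loop and may be given a constant unroll count:
//
//     LLAMA_UNROLL(4)
//     for(int i = 0; i < 16; ++i)
//         sum += data[i];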
#ifndef LLAMA_HOST_ACC
# if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
# define LLAMA_HOST_ACC __host__ __device__
# elif defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER)
# define LLAMA_HOST_ACC
# else
/// Some offloading parallelization language extensions such as CUDA, OpenACC or OpenMP 4.5 need to specify whether a
/// class, struct, function or method "resides" on the host, the accelerator (the offloading device) or both. LLAMA
/// supports this with marking every function needed on an accelerator with `LLAMA_HOST_ACC`.
# define LLAMA_HOST_ACC
# warning LLAMA_HOST_ACC is only defined empty for this compiler
# endif
#endif
#define LLAMA_FN_HOST_ACC_INLINE LLAMA_FORCE_INLINE LLAMA_HOST_ACC
#ifndef LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS
# if defined(__clang__) || defined(__INTEL_LLVM_COMPILER)
# define LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS(...) __attribute__((always_inline)) __VA_ARGS__
# elif defined(__GNUC__) || (defined(__NVCC__) && !defined(_MSC_VER))
# define LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS(...) __VA_ARGS__ __attribute__((always_inline))
# elif defined(_MSC_VER)
# define LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS(...) \
__VA_ARGS__ /* FIXME: MSVC cannot combine constexpr and [[msvc::forceinline]] */
# else
# define LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS(...) __VA_ARGS__
# warning LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS not defined for this compiler
# endif
#endif
#ifndef LLAMA_LAMBDA_INLINE
/// Gives strong indication to the compiler to inline the attributed lambda.
# define LLAMA_LAMBDA_INLINE LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS()
#endif
/// Suppresses nvcc warning: 'calling a __host__ function from __host__ __device__ function.'
#if defined(__NVCC__) && !defined(__clang__)
# define LLAMA_SUPPRESS_HOST_DEVICE_WARNING _Pragma("nv_exec_check_disable")
#else
# define LLAMA_SUPPRESS_HOST_DEVICE_WARNING
#endif
#if defined(_MSC_VER)
# define LLAMA_FORCE_INLINE_RECURSIVE __pragma(inline_depth(255))
#else
/// Forces the compiler to recursively inline the call hierarchy started by the subsequent function call.
# define LLAMA_FORCE_INLINE_RECURSIVE
#endif
/// Forces a copy of a value. This is useful to prevent ODR usage of constants when compiling for GPU targets.
#define LLAMA_COPY(x) decltype(x)(x)
// TODO(bgruber): clang 10 and 11 fail to compile this currently with the issue described here:
// https://stackoverflow.com/questions/64300832/why-does-clang-think-gccs-subrange-does-not-satisfy-gccs-ranges-begin-functi
// let's try again with clang 12
// Intel LLVM compiler is also using the clang frontend
#if(__has_include(<ranges>) && defined(__cpp_concepts) && !defined(__clang__) && !defined(__INTEL_LLVM_COMPILER))
# define CAN_USE_RANGES 1
#else
# define CAN_USE_RANGES 0
#endif
// ==
// == ./macros.hpp ==
// ============================================================================
// ============================================================================
// == ./Proofs.hpp ==
// ==
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
// ============================================================================
// == ./ArrayIndexRange.hpp ==
// ==
// #pragma once
// ============================================================================
// == ./ArrayExtents.hpp ==
// ==
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
// ============================================================================
// == ./Array.hpp ==
// ==
// Copyright 2018 Alexander Matthes
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
// #include "macros.hpp" // amalgamate: file already expanded
#include <ostream>
#include <tuple>
namespace llama
{
/// Array class like `std::array` but suitable for use with offloading devices like GPUs.
/// \tparam T type of the array elements.
/// \tparam N rank of the array.
template<typename T, std::size_t N>
struct Array
{
using value_type = T;
T element[N > 0 ? N : 1];
LLAMA_FN_HOST_ACC_INLINE constexpr auto size() const
{
return N;
}
LLAMA_FN_HOST_ACC_INLINE constexpr auto begin() -> T*
{
return &element[0];
}
LLAMA_FN_HOST_ACC_INLINE constexpr auto begin() const -> const T*
{
return &element[0];
}
LLAMA_FN_HOST_ACC_INLINE constexpr auto end() -> T*
{
return &element[N];
}
LLAMA_FN_HOST_ACC_INLINE constexpr auto end() const -> const T*
{
return &element[N];
}
template<typename IndexType>
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator[](IndexType&& idx) -> T&
{
return element[idx];
}
template<typename IndexType>
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator[](IndexType&& idx) const -> T const&
{
return element[idx];
}
LLAMA_FN_HOST_ACC_INLINE constexpr friend auto operator==(const Array& a, const Array& b) -> bool
{
for(std::size_t i = 0; i < N; ++i)
if(a.element[i] != b.element[i])
return false;
return true;
}
LLAMA_FN_HOST_ACC_INLINE constexpr friend auto operator!=(const Array& a, const Array& b) -> bool
{
return !(a == b);
}
LLAMA_FN_HOST_ACC_INLINE constexpr friend auto operator+(const Array& a, const Array& b) -> Array
{
Array temp{};
for(std::size_t i = 0; i < N; ++i)
temp[i] = a[i] + b[i];
return temp;
}
template<std::size_t I>
constexpr auto get() -> T&
{
return element[I];
}
template<std::size_t I>
constexpr auto get() const -> const T&
{
return element[I];
}
};
template<typename T>
struct Array<T, 0>
{
using value_type = T;
LLAMA_FN_HOST_ACC_INLINE constexpr auto size() const
{
return 0;
}
LLAMA_FN_HOST_ACC_INLINE constexpr auto begin() -> T*
{
return nullptr;
}
LLAMA_FN_HOST_ACC_INLINE constexpr auto begin() const -> const T*
{
return nullptr;
}
LLAMA_FN_HOST_ACC_INLINE constexpr auto end() -> T*
{
return nullptr;
}
LLAMA_FN_HOST_ACC_INLINE constexpr auto end() const -> const T*
{
return nullptr;
}
LLAMA_FN_HOST_ACC_INLINE constexpr friend auto operator==(const Array&, const Array&) -> bool
{
return true;
}
LLAMA_FN_HOST_ACC_INLINE constexpr friend auto operator!=(const Array&, const Array&) -> bool
{
return false;
}
LLAMA_FN_HOST_ACC_INLINE constexpr friend auto operator+(const Array&, const Array&) -> Array
{
return {};
}
};
template<typename First, typename... Args>
Array(First, Args... args) -> Array<First, sizeof...(Args) + 1>;
template<typename T, std::size_t N>
auto operator<<(std::ostream& os, const Array<T, N>& a) -> std::ostream&
{
os << "Array{";
bool first = true;
for(auto e : a)
{
if(first)
first = false;
else
os << ", ";
os << e;
}
os << "}";
return os;
}
template<typename T, std::size_t N>
LLAMA_FN_HOST_ACC_INLINE constexpr auto push_front([[maybe_unused]] Array<T, N> a, T v) -> Array<T, N + 1>
{
Array<T, N + 1> r{};
r[0] = v;
if constexpr(N > 0)
for(std::size_t i = 0; i < N; i++)
r[i + 1] = a[i];
return r;
}
template<typename T, std::size_t N>
LLAMA_FN_HOST_ACC_INLINE constexpr auto push_back([[maybe_unused]] Array<T, N> a, T v) -> Array<T, N + 1>
{
Array<T, N + 1> r{};
if constexpr(N > 0)
for(std::size_t i = 0; i < N; i++)
r[i] = a[i];
r[N] = v;
return r;
}
template<typename T, std::size_t N>
LLAMA_FN_HOST_ACC_INLINE constexpr auto pop_back([[maybe_unused]] Array<T, N> a)
{
static_assert(N > 0);
Array<T, N - 1> r{};
if constexpr(N > 1)
for(std::size_t i = 0; i < N - 1; i++)
r[i] = a[i];
return r;
}
template<typename T, std::size_t N>
LLAMA_FN_HOST_ACC_INLINE constexpr auto pop_front([[maybe_unused]] Array<T, N> a)
{
static_assert(N > 0);
Array<T, N - 1> r{};
if constexpr(N > 1)
for(std::size_t i = 0; i < N - 1; i++)
r[i] = a[i + 1];
return r;
}
template<typename T, std::size_t N>
LLAMA_FN_HOST_ACC_INLINE constexpr auto product(Array<T, N> a) -> T
{
T prod = 1;
for(auto s : a)
prod *= s;
return prod;
}
} // namespace llama
namespace std
{
template<typename T, size_t N>
struct tuple_size<llama::Array<T, N>> : integral_constant<size_t, N>
{
};
template<size_t I, typename T, size_t N>
struct tuple_element<I, llama::Array<T, N>>
{
using type = T;
};
} // namespace std
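// Example usage of llama::Array (illustrative sketch, not part of the original header):
//
//     llama::Array a{1, 2, 3};          // deduced as llama::Array<int, 3>
//     auto b = llama::push_back(a, 4);  // llama::Array<int, 4>{1, 2, 3, 4}
//     auto [x, y, z] = a;               // structured bindings via the std::tuple_* specializations
//     auto p = llama::product(a);       // 1 * 2 * 3 == 6
//     std::cout << b << '\n';           // prints "Array{1, 2, 3, 4}"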
// ==
// == ./Array.hpp ==
// ============================================================================
// #include "Meta.hpp" // amalgamate: file already expanded
#include <limits>
#include <type_traits>
namespace llama
{
// TODO(bgruber): make this an alias in C++20, when we have CTAD for aliases
/// Represents a run-time index into the array dimensions.
/// \tparam Dim Compile-time number of dimensions.
template<std::size_t Dim>
struct ArrayIndex : Array<std::size_t, Dim>
{
static constexpr std::size_t rank = Dim;
};
static_assert(
std::is_trivially_default_constructible_v<ArrayIndex<1>>); // so ArrayIndex<1>{} will produce a zeroed
// index. Should hold for all dimensions,
// but just checking for <1> here.
static_assert(std::is_trivially_copy_constructible_v<ArrayIndex<1>>);
static_assert(std::is_trivially_move_constructible_v<ArrayIndex<1>>);
static_assert(std::is_trivially_copy_assignable_v<ArrayIndex<1>>);
static_assert(std::is_trivially_move_assignable_v<ArrayIndex<1>>);
template<typename... Args>
ArrayIndex(Args...) -> ArrayIndex<sizeof...(Args)>;
} // namespace llama
template<size_t N>
struct std::tuple_size<llama::ArrayIndex<N>> : std::integral_constant<size_t, N>
{
};
template<size_t I, size_t N>
struct std::tuple_element<I, llama::ArrayIndex<N>>
{
using type = size_t;
};
namespace llama
{
/// Used as a template argument to \ref ArrayExtents to mark a dynamic extent.
inline constexpr std::size_t dyn = std::numeric_limits<std::size_t>::max();
/// ArrayExtents holding compile and runtime indices. This is conceptually equivalent to the std::extents of
/// std::mdspan. See: https://wg21.link/P0009
template<std::size_t... Sizes>
struct ArrayExtents : Array<typename ArrayIndex<sizeof...(Sizes)>::value_type, ((Sizes == dyn) + ... + 0)>
{
static constexpr std::size_t rank = sizeof...(Sizes);
static constexpr auto rank_dynamic = ((Sizes == dyn) + ... + 0);
static constexpr auto rank_static = rank - rank_dynamic;
using Index = ArrayIndex<rank>;
using value_type = typename Index::value_type;
template<std::size_t I>
LLAMA_FN_HOST_ACC_INLINE constexpr auto get() const
{
using namespace boost::mp11;
using TypeList = mp_list_c<std::size_t, Sizes...>;
constexpr auto extent = mp_at_c<TypeList, I>::value;
if constexpr(extent != dyn)
return extent;
else
return static_cast<const Array<value_type, rank_dynamic>&>(
*this)[+mp_count<mp_take_c<TypeList, I>, mp_size_t<dyn>>::value];
}
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator[](std::size_t i) const
{
return boost::mp11::mp_with_index<rank>(i, [&](auto ic) { return get<decltype(ic)::value>(); });
}
private:
template<std::size_t... Is>
LLAMA_FN_HOST_ACC_INLINE constexpr auto toArray(std::index_sequence<Is...>) const -> Index
{
return {get<Is>()...};
}
public:
LLAMA_FN_HOST_ACC_INLINE constexpr auto toArray() const -> Index
{
return toArray(std::make_index_sequence<rank>{});
}
LLAMA_FN_HOST_ACC_INLINE constexpr operator Index() const
{
return toArray();
}
};
template<>
struct ArrayExtents<>
{
static constexpr std::size_t rank = 0;
static constexpr auto rank_dynamic = 0;
static constexpr auto rank_static = 0;
using Index = ArrayIndex<rank>;
using value_type = typename Index::value_type;
LLAMA_FN_HOST_ACC_INLINE constexpr auto toArray() const -> Index
{
return {};
}
LLAMA_FN_HOST_ACC_INLINE constexpr operator Index() const
{
return toArray();
}
};
template<typename... Args>
ArrayExtents(Args... args) -> ArrayExtents<(Args{}, dyn)...>;
static_assert(std::is_trivially_default_constructible_v<ArrayExtents<1>>);
static_assert(std::is_trivially_copy_constructible_v<ArrayExtents<1>>);
static_assert(std::is_trivially_move_constructible_v<ArrayExtents<1>>);
static_assert(std::is_trivially_copy_assignable_v<ArrayExtents<1>>);
static_assert(std::is_trivially_move_assignable_v<ArrayExtents<1>>);
template<std::size_t... SizesA, std::size_t... SizesB>
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator==(ArrayExtents<SizesA...> a, ArrayExtents<SizesB...> b) -> bool
{
return a.toArray() == b.toArray();
}
template<std::size_t... SizesA, std::size_t... SizesB>
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator!=(ArrayExtents<SizesA...> a, ArrayExtents<SizesB...> b) -> bool
{
return !(a == b);
}
template<std::size_t... Sizes>
LLAMA_FN_HOST_ACC_INLINE constexpr auto product(ArrayExtents<Sizes...> e) ->
typename ArrayExtents<Sizes...>::value_type
{
return product(e.toArray());
}
/// N-dimensional ArrayExtents where all values are dynamic.
template<std::size_t N>
using ArrayExtentsDynamic = internal::
mp_unwrap_values_into<boost::mp11::mp_repeat_c<boost::mp11::mp_list_c<std::size_t, dyn>, N>, ArrayExtents>;
/// N-dimensional ArrayExtents where all values are Extent.
template<std::size_t N, std::size_t Extent>
using ArrayExtentsStatic = internal::
mp_unwrap_values_into<boost::mp11::mp_repeat_c<boost::mp11::mp_list_c<std::size_t, Extent>, N>, ArrayExtents>;
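// Example usage of ArrayExtents (illustrative sketch, not part of the original header):
//
//     llama::ArrayExtents extents{128, 256};          // CTAD deduces ArrayExtents<dyn, dyn>
//     llama::ArrayExtentsDynamic<2> same{128, 256};   // equivalent alias
//     llama::ArrayExtents<llama::dyn, 16> mixed{42};  // extent 0 is run time, extent 1 is static
//     static_assert(decltype(mixed)::rank == 2 && decltype(mixed)::rank_dynamic == 1);
//     assert(mixed[0] == 42 && mixed[1] == 16);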
template<std::size_t Dim, typename Func, typename... OuterIndices>
LLAMA_FN_HOST_ACC_INLINE void forEachADCoord(
[[maybe_unused]] ArrayIndex<Dim> adSize,
Func&& func,
OuterIndices... outerIndices)
{
if constexpr(Dim > 0)
for(std::size_t i = 0; i < adSize[0]; i++)
forEachADCoord(ArrayIndex<Dim - 1>{pop_front(adSize)}, std::forward<Func>(func), outerIndices..., i);
else
std::forward<Func>(func)(ArrayIndex<sizeof...(outerIndices)>{outerIndices...});
}
template<std::size_t... Sizes, typename Func>
LLAMA_FN_HOST_ACC_INLINE void forEachADCoord(ArrayExtents<Sizes...> extents, Func&& func)
{
forEachADCoord(extents.toArray(), std::forward<Func>(func));
}
} // namespace llama
template<std::size_t... Sizes>
struct std::tuple_size<llama::ArrayExtents<Sizes...>> : std::integral_constant<std::size_t, sizeof...(Sizes)>
{
};
template<std::size_t I, std::size_t... Sizes>
struct std::tuple_element<I, llama::ArrayExtents<Sizes...>>
{
using type = typename llama::ArrayExtents<Sizes...>::value_type;
};
// ==
// == ./ArrayExtents.hpp ==
// ============================================================================
// ============================================================================
// == ./Core.hpp ==
// ==
// Copyright 2018 Alexander Matthes
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
// #include "ArrayExtents.hpp" // amalgamate: file already expanded
// #include "Meta.hpp" // amalgamate: file already expanded
// ============================================================================
// == ./RecordCoord.hpp ==
// ==
// Copyright 2018 Alexander Matthes
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
// #include "Meta.hpp" // amalgamate: file already expanded
#include <array>
// #include <ostream> // amalgamate: file already included
// #include <type_traits> // amalgamate: file already included
namespace llama
{
/// Represents a coordinate for a record inside the record dimension tree.
/// \tparam Coords... the compile-time coordinate components.
template<std::size_t... Coords>
struct RecordCoord
{
/// The list of integral coordinates as `boost::mp11::mp_list`.
using List = boost::mp11::mp_list_c<std::size_t, Coords...>;
static constexpr std::size_t front = boost::mp11::mp_front<List>::value;
static constexpr std::size_t back = boost::mp11::mp_back<List>::value;
static constexpr std::size_t size = sizeof...(Coords);
};
template<>
struct RecordCoord<>
{
using List = boost::mp11::mp_list_c<std::size_t>;
static constexpr std::size_t size = 0;
};
template<std::size_t... CoordsA, std::size_t... CoordsB>
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator==(RecordCoord<CoordsA...>, RecordCoord<CoordsB...>)
{
return false;
}
template<std::size_t... Coords>
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator==(RecordCoord<Coords...>, RecordCoord<Coords...>)
{
return true;
}
template<std::size_t... CoordsA, std::size_t... CoordsB>
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator!=(RecordCoord<CoordsA...> a, RecordCoord<CoordsB...> b)
{
return !(a == b);
}
template<typename T>
inline constexpr bool isRecordCoord = false;
template<std::size_t... Coords>
inline constexpr bool isRecordCoord<RecordCoord<Coords...>> = true;
template<std::size_t... RCs>
auto operator<<(std::ostream& os, RecordCoord<RCs...>) -> std::ostream&
{
os << "RecordCoord<";
bool first = true;
for(auto rc : std::array<std::size_t, sizeof...(RCs)>{RCs...})
{
if(first)
first = false;
else
os << ", ";
os << rc;
}
os << ">";
return os;
}
inline namespace literals
{
/// Literal operator for converting a numeric literal into a \ref RecordCoord.
template<char... Digits>
constexpr auto operator"" _RC()
{
constexpr auto coord = []() constexpr
{
char digits[] = {(Digits - 48)...};
std::size_t acc = 0;
std::size_t powerOf10 = 1;
for(int i = sizeof...(Digits) - 1; i >= 0; i--)
{
acc += digits[i] * powerOf10;
powerOf10 *= 10;
}
return acc;
}
();
return RecordCoord<coord>{};
}
} // namespace literals
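// Example usage of RecordCoord and the _RC literal (illustrative sketch, not part of the
// original header):
//
//     using namespace llama::literals;
//     constexpr auto rc = 1_RC;                        // llama::RecordCoord<1>
//     constexpr auto joined = llama::cat(0_RC, 1_RC);  // llama::RecordCoord<0, 1>
//     static_assert(joined == llama::RecordCoord<0, 1>{});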
/// Converts a type list of integral constants into a \ref RecordCoord.
template<typename L>
using RecordCoordFromList = internal::mp_unwrap_values_into<L, RecordCoord>;
/// Concatenate a set of \ref RecordCoord%s.
template<typename... RecordCoords>
using Cat = RecordCoordFromList<boost::mp11::mp_append<typename RecordCoords::List...>>;
/// Concatenate a set of \ref RecordCoord%s instances.
template<typename... RecordCoords>
constexpr auto cat(RecordCoords...)
{
return Cat<RecordCoords...>{};
}
/// RecordCoord without first coordinate component.
template<typename RecordCoord>
using PopFront = RecordCoordFromList<boost::mp11::mp_pop_front<typename RecordCoord::List>>;
namespace internal
{
template<typename First, typename Second>
struct RecordCoordCommonPrefixIsBiggerImpl;
template<std::size_t... Coords1, std::size_t... Coords2>
struct RecordCoordCommonPrefixIsBiggerImpl<RecordCoord<Coords1...>, RecordCoord<Coords2...>>
{
static constexpr auto value = []() constexpr
{
// CTAD does not work if Coords1/2 is an empty pack
std::array<std::size_t, sizeof...(Coords1)> a1{Coords1...};
std::array<std::size_t, sizeof...(Coords2)> a2{Coords2...};
for(std::size_t i = 0; i < std::min(a1.size(), a2.size()); i++)
{
if(a1[i] > a2[i])
return true;
if(a1[i] < a2[i])
return false;
}
return false;
}
();
};
} // namespace internal
/// Checks whether the first RecordCoord is bigger than the second (compared along their common prefix).
template<typename First, typename Second>
inline constexpr auto RecordCoordCommonPrefixIsBigger
= internal::RecordCoordCommonPrefixIsBiggerImpl<First, Second>::value;
namespace internal
{
template<typename First, typename Second>
struct RecordCoordCommonPrefixIsSameImpl;
template<std::size_t... Coords1, std::size_t... Coords2>
struct RecordCoordCommonPrefixIsSameImpl<RecordCoord<Coords1...>, RecordCoord<Coords2...>>
{
static constexpr auto value = []() constexpr
{
// CTAD does not work if Coords1/2 is an empty pack
std::array<std::size_t, sizeof...(Coords1)> a1{Coords1...};
std::array<std::size_t, sizeof...(Coords2)> a2{Coords2...};
for(std::size_t i = 0; i < std::min(a1.size(), a2.size()); i++)
if(a1[i] != a2[i])
return false;
return true;
}
();
};
} // namespace internal
/// Checks whether two \ref RecordCoord%s are the same or one is the prefix of the other.
template<typename First, typename Second>
inline constexpr auto RecordCoordCommonPrefixIsSame
= internal::RecordCoordCommonPrefixIsSameImpl<First, Second>::value;
} // namespace llama
// ==
// == ./RecordCoord.hpp ==
// ============================================================================
#include <boost/core/demangle.hpp>
#include <iostream>
#include <string>
// #include <type_traits> // amalgamate: file already included
namespace llama
{
/// Anonymous naming for a \ref Field.
struct NoName
{
};
/// A type list of \ref Field%s which may be used to define a record dimension.
template<typename... Fields>
struct Record
{
};
/// @brief Tells whether the given type is allowed as a field type in LLAMA. Such types need to be trivially
/// constructible and trivially destructible.
template<typename T>
inline constexpr bool isAllowedFieldType = std::is_trivially_destructible_v<T>;
/// Record dimension tree node which may either be a leaf or refer to a child tree presented as another \ref
/// Record.
/// \tparam Tag Name of the node. May be any type (struct, class).
/// \tparam Type Type of the node. May be one of three cases: 1. another subtree given as a nested \ref
/// Record. 2. an array of static size of any type, in which case a Record with as many \ref Field%s as the array
/// size is created, each named by a \ref RecordCoord specialized on the consecutive numbers I. 3. a scalar type
/// different from \ref Record, making this node a leaf of that type.
template<typename Tag, typename Type>
struct Field
{
static_assert(isAllowedFieldType<Type>, "This field's type is not allowed");
};
struct NrAndOffset
{
std::size_t nr;
std::size_t offset;
friend auto operator==(const NrAndOffset& a, const NrAndOffset& b) -> bool
{
return a.nr == b.nr && a.offset == b.offset;
}
friend auto operator!=(const NrAndOffset& a, const NrAndOffset& b) -> bool
{
return !(a == b);
}
friend auto operator<<(std::ostream& os, const NrAndOffset& value) -> std::ostream&
{
return os << "NrAndOffset{" << value.nr << ", " << value.offset << "}";
}
};
/// Get the tag from a \ref Field.
template<typename Field>
using GetFieldTag = boost::mp11::mp_first<Field>;
/// Get the type from a \ref Field.
template<typename Field>
using GetFieldType = boost::mp11::mp_second<Field>;
template<typename T>
inline constexpr auto isRecord = false;
template<typename... Fields>
inline constexpr auto isRecord<Record<Fields...>> = true;
namespace internal
{
template<typename RecordDim, typename RecordCoord>
struct GetTagsImpl;
template<typename... Fields, std::size_t FirstCoord, std::size_t... Coords>
struct GetTagsImpl<Record<Fields...>, RecordCoord<FirstCoord, Coords...>>
{
using Field = boost::mp11::mp_at_c<boost::mp11::mp_list<Fields...>, FirstCoord>;
using ChildTag = GetFieldTag<Field>;
using ChildType = GetFieldType<Field>;
using type
= boost::mp11::mp_push_front<typename GetTagsImpl<ChildType, RecordCoord<Coords...>>::type, ChildTag>;
};
template<typename ChildType, std::size_t Count, std::size_t FirstCoord, std::size_t... Coords>
struct GetTagsImpl<ChildType[Count], RecordCoord<FirstCoord, Coords...>>
{
using ChildTag = RecordCoord<FirstCoord>;
using type
= boost::mp11::mp_push_front<typename GetTagsImpl<ChildType, RecordCoord<Coords...>>::type, ChildTag>;
};
template<typename T>
struct GetTagsImpl<T, RecordCoord<>>
{
using type = boost::mp11::mp_list<>;
};
} // namespace internal
/// Get the tags of all \ref Field%s from the root of the record dimension tree down to the node identified by
/// \ref RecordCoord.
template<typename RecordDim, typename RecordCoord>
using GetTags = typename internal::GetTagsImpl<RecordDim, RecordCoord>::type;
namespace internal
{
template<typename RecordDim, typename RecordCoord>
struct GetTagImpl
{
using type = boost::mp11::mp_back<GetTags<RecordDim, RecordCoord>>;
};
template<typename RecordDim>
struct GetTagImpl<RecordDim, RecordCoord<>>
{
using type = NoName;
};
} // namespace internal
/// Get the tag of the \ref Field at a \ref RecordCoord inside the record dimension tree.
template<typename RecordDim, typename RecordCoord>
using GetTag = typename internal::GetTagImpl<RecordDim, RecordCoord>::type;
/// Is true if, starting at two coordinates in two record dimensions, all subsequent nodes in the record dimension
/// tree have the same tag.
/// \tparam RecordDimA First record dimension.
/// \tparam LocalA \ref RecordCoord based on RecordDimA along which the tags are compared.
/// \tparam RecordDimB Second record dimension.
/// \tparam LocalB \ref RecordCoord based on RecordDimB along which the tags are compared.
template<typename RecordDimA, typename LocalA, typename RecordDimB, typename LocalB>
inline constexpr auto hasSameTags = []() constexpr
{
if constexpr(LocalA::size != LocalB::size)
return false;
else if constexpr(LocalA::size == 0 && LocalB::size == 0)
return true;
else
return std::is_same_v<GetTags<RecordDimA, LocalA>, GetTags<RecordDimB, LocalB>>;
}
();
namespace internal
{
template<typename FieldList, typename Tag>
struct FindFieldByTag
{
template<typename Field>
using HasTag = std::is_same<GetFieldTag<Field>, Tag>;
static constexpr auto value = boost::mp11::mp_find_if<FieldList, HasTag>::value;
};
template<typename RecordDim, typename RecordCoord, typename... Tags>
struct GetCoordFromTagsImpl
{
static_assert(boost::mp11::mp_size<RecordDim>::value != 0, "Tag combination is not valid");
};
template<typename... Fields, std::size_t... ResultCoords, typename FirstTag, typename... Tags>
struct GetCoordFromTagsImpl<Record<Fields...>, RecordCoord<ResultCoords...>, FirstTag, Tags...>
{
static constexpr auto tagIndex = FindFieldByTag<boost::mp11::mp_list<Fields...>, FirstTag>::value;
static_assert(
tagIndex < sizeof...(Fields),
"FirstTag was not found inside this Record. Does your record dimension contain the tag you access "
"with?");
using ChildType = GetFieldType<boost::mp11::mp_at_c<Record<Fields...>, tagIndex>>;
using type =
typename GetCoordFromTagsImpl<ChildType, RecordCoord<ResultCoords..., tagIndex>, Tags...>::type;
};
template<
typename ChildType,
std::size_t Count,
std::size_t... ResultCoords,
typename FirstTag,
typename... Tags>
struct GetCoordFromTagsImpl<ChildType[Count], RecordCoord<ResultCoords...>, FirstTag, Tags...>
{
static_assert(isRecordCoord<FirstTag>, "Please use a RecordCoord<I> to index into static arrays");
static_assert(FirstTag::size == 1, "Expected RecordCoord with 1 coordinate");
static_assert(FirstTag::front < Count, "Index out of bounds");
using type =
typename GetCoordFromTagsImpl<ChildType, RecordCoord<ResultCoords..., FirstTag::front>, Tags...>::type;
};
template<typename RecordDim, typename RecordCoord>
struct GetCoordFromTagsImpl<RecordDim, RecordCoord>
{
using type = RecordCoord;
};
// unpack a list of tags
template<typename... Fields, typename... Tags>
struct GetCoordFromTagsImpl<Record<Fields...>, RecordCoord<>, boost::mp11::mp_list<Tags...>>
: GetCoordFromTagsImpl<Record<Fields...>, RecordCoord<>, Tags...>
{
};
template<typename ChildType, std::size_t Count, typename... Tags>
struct GetCoordFromTagsImpl<ChildType[Count], RecordCoord<>, boost::mp11::mp_list<Tags...>>
: GetCoordFromTagsImpl<ChildType[Count], RecordCoord<>, Tags...>
{
};
} // namespace internal
/// Converts a series of tags, or a list of tags, navigating down a record dimension into a \ref RecordCoord.
template<typename RecordDim, typename... Tags>
using GetCoordFromTags = typename internal::GetCoordFromTagsImpl<RecordDim, RecordCoord<>, Tags...>::type;
namespace internal
{
template<typename RecordDim, typename... RecordCoordOrTags>
struct GetTypeImpl
{
using type = typename GetTypeImpl<RecordDim, GetCoordFromTags<RecordDim, RecordCoordOrTags...>>::type;
};
template<typename... Children, std::size_t HeadCoord, std::size_t... TailCoords>
struct GetTypeImpl<Record<Children...>, RecordCoord<HeadCoord, TailCoords...>>
{
using ChildType = GetFieldType<boost::mp11::mp_at_c<Record<Children...>, HeadCoord>>;
using type = typename GetTypeImpl<ChildType, RecordCoord<TailCoords...>>::type;
};
template<typename ChildType, std::size_t N, std::size_t HeadCoord, std::size_t... TailCoords>
struct GetTypeImpl<ChildType[N], RecordCoord<HeadCoord, TailCoords...>>
{
using type = typename GetTypeImpl<ChildType, RecordCoord<TailCoords...>>::type;
};
template<typename T>
struct GetTypeImpl<T, RecordCoord<>>
{
static_assert(isAllowedFieldType<T>);
using type = T;
};
} // namespace internal
/// Returns the type of a node in a record dimension tree identified by a given \ref RecordCoord or a series of
/// tags.
template<typename RecordDim, typename... RecordCoordOrTags>
using GetType = typename internal::GetTypeImpl<RecordDim, RecordCoordOrTags...>::type;
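// Example (illustrative sketch, not part of the original header; the tag structs and the record
// dimension Particle are made up for this example):
//
//     struct X{}; struct Y{}; struct Pos{}; struct Vel{}; struct Mass{};
//     using Vec = llama::Record<llama::Field<X, float>, llama::Field<Y, float>>;
//     using Particle = llama::Record<
//         llama::Field<Pos, Vec>,
//         llama::Field<Vel, Vec>,
//         llama::Field<Mass, float>>;
//     static_assert(std::is_same_v<llama::GetCoordFromTags<Particle, Vel, Y>, llama::RecordCoord<1, 1>>);
//     static_assert(std::is_same_v<llama::GetType<Particle, llama::RecordCoord<1, 1>>, float>);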
namespace internal
{
template<typename RecordDim, typename RecordCoord>
struct LeafRecordCoordsImpl;
template<typename T, std::size_t... RCs>
struct LeafRecordCoordsImpl<T, RecordCoord<RCs...>>
{
using type = boost::mp11::mp_list<RecordCoord<RCs...>>;
};
template<typename... Fields, std::size_t... RCs>
struct LeafRecordCoordsImpl<Record<Fields...>, RecordCoord<RCs...>>
{
template<std::size_t... Is>
static auto help(std::index_sequence<Is...>)
{
return boost::mp11::mp_append<
typename LeafRecordCoordsImpl<GetFieldType<Fields>, RecordCoord<RCs..., Is>>::type...>{};
}
using type = decltype(help(std::make_index_sequence<sizeof...(Fields)>{}));
};
template<typename Child, std::size_t N, std::size_t... RCs>
struct LeafRecordCoordsImpl<Child[N], RecordCoord<RCs...>>
{
template<std::size_t... Is>
static auto help(std::index_sequence<Is...>)
{
return boost::mp11::mp_append<
typename LeafRecordCoordsImpl<Child, RecordCoord<RCs..., Is>>::type...>{};
}
using type = decltype(help(std::make_index_sequence<N>{}));
};
} // namespace internal
/// Returns a flat type list containing all record coordinates to all leaves of the given record dimension.
template<typename RecordDim>
using LeafRecordCoords = typename internal::LeafRecordCoordsImpl<RecordDim, RecordCoord<>>::type;
namespace internal
{
// adapted from boost::mp11, but with LLAMA_FN_HOST_ACC_INLINE
template<template<typename...> typename L, typename... T, typename F>
LLAMA_FN_HOST_ACC_INLINE constexpr void mp_for_each_inlined(L<T...>, F&& f)
{
using A = int[sizeof...(T)];
(void) A{((void) f(T{}), 0)...};
}
} // namespace internal
/// Iterates over the record dimension tree and calls a functor on each element.
/// \param functor Functor to execute at each element. Needs to have `operator()` with a template parameter for
/// the \ref RecordCoord in the record dimension tree.
/// \param baseCoord \ref RecordCoord at which the iteration should be started. The functor is called on elements
/// beneath this coordinate.
template<typename RecordDim, typename Functor, std::size_t... Coords>
LLAMA_FN_HOST_ACC_INLINE constexpr void forEachLeafCoord(Functor&& functor, RecordCoord<Coords...> baseCoord)
{
LLAMA_FORCE_INLINE_RECURSIVE
internal::mp_for_each_inlined(
LeafRecordCoords<GetType<RecordDim, RecordCoord<Coords...>>>{},
[&](auto innerCoord) LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS(constexpr)
{ std::forward<Functor>(functor)(cat(baseCoord, innerCoord)); });
}
/// Iterates over the record dimension tree and calls a functor on each element.
/// \param functor Functor to execute at each element. Needs to have `operator()` with a template parameter for
/// the \ref RecordCoord in the record dimension tree.
/// \param baseTags Tags used to define where the iteration should be started. The functor is called on elements
/// beneath this coordinate.
template<typename RecordDim, typename Functor, typename... Tags>
LLAMA_FN_HOST_ACC_INLINE constexpr void forEachLeafCoord(Functor&& functor, Tags... /*baseTags*/)
{
LLAMA_FORCE_INLINE_RECURSIVE
forEachLeafCoord<RecordDim>(std::forward<Functor>(functor), GetCoordFromTags<RecordDim, Tags...>{});
}
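// Example usage of forEachLeafCoord (illustrative sketch, not part of the original header;
// Particle and Vel refer to the hypothetical record dimension sketched above):
//
//     llama::forEachLeafCoord<Particle>([](auto rc) {
//         // called with RecordCoord<0, 0>, <0, 1>, <1, 0>, <1, 1> and <2>
//     });
//     llama::forEachLeafCoord<Particle>([](auto rc) {
//         // called with RecordCoord<1, 0> and <1, 1> only
//     }, Vel{});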
namespace internal
{
template<typename T>
struct FlattenRecordDimImpl
{
using type = boost::mp11::mp_list<T>;
};
template<typename... Fields>
struct FlattenRecordDimImpl<Record<Fields...>>
{
using type = boost::mp11::mp_append<typename FlattenRecordDimImpl<GetFieldType<Fields>>::type...>;
};
template<typename Child, std::size_t N>
struct FlattenRecordDimImpl<Child[N]>
{
using type = boost::mp11::mp_repeat_c<typename FlattenRecordDimImpl<Child>::type, N>;
};
} // namespace internal
/// Returns a flat type list containing all leaf field types of the given record dimension.
template<typename RecordDim>
using FlatRecordDim = typename internal::FlattenRecordDimImpl<RecordDim>::type;
/// The total number of fields in the recursively expanded record dimension.
template<typename RecordDim>
inline constexpr std::size_t flatFieldCount = 1;
template<typename... Children>
inline constexpr std::size_t flatFieldCount<
Record<Children...>> = (flatFieldCount<GetFieldType<Children>> + ... + 0);
template<typename Child, std::size_t N>
inline constexpr std::size_t flatFieldCount<Child[N]> = flatFieldCount<Child>* N;
namespace internal
{
template<std::size_t I, typename RecordDim>
inline constexpr std::size_t flatFieldCountBefore = 0;
template<typename... Children>
inline constexpr std::size_t flatFieldCountBefore<0, Record<Children...>> = 0;
// recursive formulation to benefit from template instantiation memoization
// this massively improves compilation time when this template is instantiated with a lot of different I
template<std::size_t I, typename... Children>
inline constexpr std::size_t flatFieldCountBefore<
I,
Record<
Children...>> = flatFieldCountBefore<I - 1, Record<Children...>> + flatFieldCount<GetFieldType<boost::mp11::mp_at_c<Record<Children...>, I - 1>>>;
} // namespace internal
/// The equivalent zero-based index into a flat record dimension (\ref FlatRecordDim) of the given hierarchical
/// record coordinate.
template<typename RecordDim, typename RecordCoord>
inline constexpr std::size_t flatRecordCoord = 0;
template<typename T>
inline constexpr std::size_t flatRecordCoord<T, RecordCoord<>> = 0;
template<typename... Children, std::size_t I, std::size_t... Is>
inline constexpr std::size_t flatRecordCoord<
Record<Children...>,
RecordCoord<
I,
Is...>> = internal::
flatFieldCountBefore<
I,
Record<
Children...>> + flatRecordCoord<GetFieldType<boost::mp11::mp_at_c<Record<Children...>, I>>, RecordCoord<Is...>>;
template<typename Child, std::size_t N, std::size_t I, std::size_t... Is>
inline constexpr std::size_t flatRecordCoord<Child[N], RecordCoord<I, Is...>> = flatFieldCount<Child>* I
+ flatRecordCoord<Child, RecordCoord<Is...>>;
namespace internal
{
template<typename TypeList>
constexpr auto flatAlignOfImpl()
{
using namespace boost::mp11;
std::size_t maxAlign = 0;
mp_for_each<mp_transform<mp_identity, TypeList>>([&](auto e) constexpr
{
using T = typename decltype(e)::type;
maxAlign = std::max(maxAlign, alignof(T));
});
return maxAlign;
}
} // namespace internal
/// The alignment of a type list if its elements would be in a normal struct.
template<typename TypeList>
inline constexpr std::size_t flatAlignOf = internal::flatAlignOfImpl<TypeList>();
/// The alignment of a type T.
template<typename T>
inline constexpr std::size_t alignOf = alignof(T);
/// The alignment of a record dimension if its fields would be in a normal struct.
template<typename... Fields>
inline constexpr std::size_t alignOf<Record<Fields...>> = flatAlignOf<FlatRecordDim<Record<Fields...>>>;
namespace internal
{
constexpr void roundUpToMultiple(std::size_t& value, std::size_t multiple)
{
value = ((value + multiple - 1) / multiple) * multiple;
}
template<typename TypeList, bool Align, bool IncludeTailPadding>
constexpr auto sizeOfImpl() -> std::size_t
{
using namespace boost::mp11;
std::size_t size = 0;
std::size_t maxAlign = 0;
mp_for_each<mp_transform<mp_identity, TypeList>>([&](auto e) constexpr
{
using T = typename decltype(e)::type;
if constexpr(Align)
{
roundUpToMultiple(size, alignof(T));
maxAlign = std::max(maxAlign, alignof(T));
}
// NOLINTNEXTLINE(readability-misleading-indentation)
size += sizeof(T);
});
// final padding, so next struct can start right away
if constexpr(Align && IncludeTailPadding)
roundUpToMultiple(size, maxAlign); // TODO(bgruber): we could use flatAlignOf<TypeList> here, at the
// cost of more template instantiations
return size;
}
template<bool Align, typename TypeList, std::size_t I>
constexpr auto offsetOfImplWorkaround() -> std::size_t;
// recursive formulation to benefit from template instantiation memoization
// this massively improves compilation time when this template is instantiated with a lot of different I
template<bool Align, typename TypeList, std::size_t I>
inline constexpr std::size_t offsetOfImpl
= offsetOfImplWorkaround<Align, TypeList, I>(); // FIXME: MSVC fails to compile an IILE here.
template<bool Align, typename TypeList>
inline constexpr std::size_t offsetOfImpl<Align, TypeList, 0> = 0;
template<bool Align, typename TypeList, std::size_t I>
constexpr auto offsetOfImplWorkaround() -> std::size_t
{
std::size_t offset = offsetOfImpl<Align, TypeList, I - 1> + sizeof(boost::mp11::mp_at_c<TypeList, I - 1>);
if constexpr(Align)
roundUpToMultiple(offset, alignof(boost::mp11::mp_at_c<TypeList, I>));
return offset;
}
} // namespace internal
/// The size of a type list if its elements would be in a normal struct.
template<typename TypeList, bool Align, bool IncludeTailPadding = true>
inline constexpr std::size_t flatSizeOf = internal::sizeOfImpl<TypeList, Align, IncludeTailPadding>();
/// The size of a type T.
template<typename T, bool Align = false, bool IncludeTailPadding = true>
inline constexpr std::size_t sizeOf = sizeof(T);
/// The size of a record dimension if its fields would be in a normal struct.
template<typename... Fields, bool Align, bool IncludeTailPadding>
inline constexpr std::size_t sizeOf<Record<Fields...>, Align, IncludeTailPadding> = flatSizeOf<
FlatRecordDim<Record<Fields...>>,
Align,
IncludeTailPadding>;
/// The byte offset of an element in a type list if its elements would be in a normal struct.
template<typename TypeList, std::size_t I, bool Align>
inline constexpr std::size_t flatOffsetOf = internal::offsetOfImpl<Align, TypeList, I>;
/// The byte offset of an element in a record dimension if it would be a normal struct.
/// \tparam RecordDim Record dimension tree.
/// \tparam RecordCoord Record coordinate of an element in the record dimension tree.
template<typename RecordDim, typename RecordCoord, bool Align = false>
inline constexpr std::size_t offsetOf
= flatOffsetOf<FlatRecordDim<RecordDim>, flatRecordCoord<RecordDim, RecordCoord>, Align>;
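// Example (illustrative sketch, not part of the original header; Particle is the hypothetical
// record dimension sketched above):
//
//     static_assert(llama::flatFieldCount<Particle> == 5);
//     static_assert(llama::flatRecordCoord<Particle, llama::RecordCoord<1, 1>> == 3);
//     static_assert(llama::sizeOf<Particle> == 5 * sizeof(float));  // packed, no alignment
//     static_assert(llama::offsetOf<Particle, llama::RecordCoord<2>> == 4 * sizeof(float));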
template<typename S>
auto structName(S = {}) -> std::string
{
auto s = boost::core::demangle(typeid(S).name());
if(const auto pos = s.rfind(':'); pos != std::string::npos)
s = s.substr(pos + 1);
return s;
}
namespace internal
{
template<typename T>
struct IndirectValue
{
T value;
auto operator->() -> T*
{
return &value;
}
auto operator->() const -> const T*
{
return &value;
}
};
// TODO(bgruber): replace in C++20
template<class T>
struct IsBoundedArray : std::false_type
{
};
template<class T, std::size_t N>
struct IsBoundedArray<T[N]> : std::true_type
{
};
} // namespace internal
/// Returns the integral n rounded up to be a multiple of mult.
template<typename Integral>
LLAMA_FN_HOST_ACC_INLINE constexpr auto roundUpToMultiple(Integral n, Integral mult) -> Integral
{
return (n + mult - 1) / mult * mult;
}
namespace internal
{
template<typename T, template<typename> typename TypeFunctor>
struct TransformLeavesImpl
{
using type = TypeFunctor<T>;
};
template<typename... Fields, template<typename> typename TypeFunctor>
struct TransformLeavesImpl<Record<Fields...>, TypeFunctor>
{
using type = Record<
Field<GetFieldTag<Fields>, typename TransformLeavesImpl<GetFieldType<Fields>, TypeFunctor>::type>...>;
};
template<typename Child, std::size_t N, template<typename> typename TypeFunctor>
struct TransformLeavesImpl<Child[N], TypeFunctor>
{
using type = typename TransformLeavesImpl<Child, TypeFunctor>::type[N];
};
} // namespace internal
/// Creates a new record dimension where each new leaf field's type is the result of applying FieldTypeFunctor to
/// the original leaf field's type.
template<typename RecordDim, template<typename> typename FieldTypeFunctor>
using TransformLeaves = typename internal::TransformLeavesImpl<RecordDim, FieldTypeFunctor>::type;
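// Example usage of TransformLeaves (illustrative sketch, not part of the original header;
// Particle is the hypothetical record dimension sketched above):
//
//     template<typename T>
//     using ToDouble = double;  // type functor mapping every leaf type to double
//     using ParticleD = llama::TransformLeaves<Particle, ToDouble>;
//     // ParticleD has the same tags and structure as Particle, but every leaf field is a double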
namespace internal
{
// TODO: we might implement this better by expanding a record dim into a list of tag lists and then computing a
// real set union of the two lists of tag lists
template<typename A, typename B>
auto mergeRecordDimsImpl(boost::mp11::mp_identity<A> a, boost::mp11::mp_identity<B>)
{
static_assert(std::is_same_v<A, B>, "Cannot merge record and non-record or fields with different types");
return a;
}
template<typename A, std::size_t NA, typename B, std::size_t NB>
auto mergeRecordDimsImpl(
[[maybe_unused]] boost::mp11::mp_identity<A[NA]> a,
[[maybe_unused]] boost::mp11::mp_identity<B[NB]> b)
{
static_assert(std::is_same_v<A, B>, "Cannot merge arrays of different type");
if constexpr(NA < NB)
return b;
else
return a;
}
template<typename... FieldsA>
auto mergeRecordDimsImpl(boost::mp11::mp_identity<Record<FieldsA...>> a, boost::mp11::mp_identity<Record<>>)
{
return a;
}
template<
typename... FieldsA,
typename FieldB,
typename... FieldsB,
auto pos = FindFieldByTag<Record<FieldsA...>, GetFieldTag<FieldB>>::value>
auto mergeRecordDimsImpl(
boost::mp11::mp_identity<Record<FieldsA...>>,
boost::mp11::mp_identity<Record<FieldB, FieldsB...>>)
{
using namespace boost::mp11;
if constexpr(pos == sizeof...(FieldsA))
{
return mergeRecordDimsImpl(
mp_identity<Record<FieldsA..., FieldB>>{},
mp_identity<Record<FieldsB...>>{});
}
else
{
using OldFieldA = mp_at_c<Record<FieldsA...>, pos>;
using NewFieldA = Field<
GetFieldTag<OldFieldA>,
typename decltype(mergeRecordDimsImpl(
mp_identity<GetFieldType<OldFieldA>>{},
mp_identity<GetFieldType<FieldB>>{}))::type>;
using NewRecordA = mp_replace_at_c<Record<FieldsA...>, pos, NewFieldA>;
return mergeRecordDimsImpl(mp_identity<NewRecordA>{}, mp_identity<Record<FieldsB...>>{});
}
}
} // namespace internal
/// Creates a merged record dimension, where duplicated, nested fields are unified.
template<typename RecordDimA, typename RecordDimB>
using MergedRecordDims = typename decltype(internal::mergeRecordDimsImpl(
boost::mp11::mp_identity<RecordDimA>{},
boost::mp11::mp_identity<RecordDimB>{}))::type;
/// Returns the tags interspersed by '.' represented by the given record coord in the given record dimension.
template<typename RecordDim, std::size_t... Coords>
auto recordCoordTags(RecordCoord<Coords...>) -> std::string
{
using Tags = GetTags<RecordDim, RecordCoord<Coords...>>;
std::string r;
boost::mp11::mp_for_each<Tags>(
[&](auto tag)
{
using Tag = decltype(tag);
if(!r.empty())
r += '.';
if constexpr(isRecordCoord<Tag>)
{
static_assert(Tag::size == 1);
r += std::to_string(Tag::front); // handle array indices
}
else
r += structName(tag);
});
return r;
}
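// Example usage of recordCoordTags (illustrative sketch, not part of the original header;
// Particle is the hypothetical record dimension sketched above):
//
//     auto tags = llama::recordCoordTags<Particle>(llama::RecordCoord<1, 1>{});  // "Vel.Y"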
} // namespace llama
// ==
// == ./Core.hpp ==
// ============================================================================
#include <algorithm>
#include <iterator>
// #include <limits> // amalgamate: file already included
#if CAN_USE_RANGES
# include <ranges>
#endif
namespace llama
{
/// Iterator supporting \ref ArrayIndexRange.
template<typename ArrayExtents>
struct ArrayIndexIterator
{
static_assert(!std::is_const_v<ArrayExtents>);
using value_type = typename ArrayExtents::Index;
using difference_type = std::ptrdiff_t;
using reference = value_type;
using pointer = internal::IndirectValue<value_type>;
using iterator_category = std::random_access_iterator_tag;
static constexpr std::size_t rank = ArrayExtents::rank;
constexpr ArrayIndexIterator() noexcept = default;
LLAMA_FN_HOST_ACC_INLINE constexpr ArrayIndexIterator(ArrayExtents extents, value_type current) noexcept
: extents(extents)
, current(current)
{
}
LLAMA_FN_HOST_ACC_INLINE
constexpr auto operator*() const noexcept -> value_type
{
return current;
}
LLAMA_FN_HOST_ACC_INLINE
constexpr auto operator->() const noexcept -> pointer
{
return {**this};
}
LLAMA_FN_HOST_ACC_INLINE
constexpr auto operator++() noexcept -> ArrayIndexIterator&
{
current[rank - 1]++;
for(auto i = static_cast<int>(rank) - 2; i >= 0; i--)
{
if(current[i + 1] != extents[i + 1])
return *this;
current[i + 1] = 0;
current[i]++;
}
return *this;
}
LLAMA_FN_HOST_ACC_INLINE
constexpr auto operator++(int) noexcept -> ArrayIndexIterator
{
auto tmp = *this;
++*this;
return tmp;
}
LLAMA_FN_HOST_ACC_INLINE
constexpr auto operator--() noexcept -> ArrayIndexIterator&
{
current[rank - 1]--;
for(auto i = static_cast<int>(rank) - 2; i >= 0; i--)
{
if(current[i + 1] != std::numeric_limits<std::size_t>::max())
return *this;
current[i + 1] = extents[i] - 1;
current[i]--;
}
// decrementing beyond [0, 0, ..., 0] is UB
return *this;
}
LLAMA_FN_HOST_ACC_INLINE
constexpr auto operator--(int) noexcept -> ArrayIndexIterator
{
auto tmp = *this;
--*this;
return tmp;
}
LLAMA_FN_HOST_ACC_INLINE
constexpr auto operator[](difference_type i) const noexcept -> reference
{
return *(*this + i);
}
LLAMA_FN_HOST_ACC_INLINE
constexpr auto operator+=(difference_type n) noexcept -> ArrayIndexIterator&
{
// add n to all lower dimensions with carry
for(auto i = static_cast<int>(rank) - 1; i > 0 && n != 0; i--)
{
n += static_cast<difference_type>(current[i]);
const auto s = static_cast<difference_type>(extents[i]);
auto mod = n % s;
n /= s;
if(mod < 0)
{
mod += s;
n--;
}
current[i] = mod;
assert(current[i] < extents[i]);
}
current[0] = static_cast<difference_type>(current[0]) + n;
// current is either within bounds or at the end ([last + 1, 0, 0, ..., 0])
assert(
(current[0] < extents[0]
|| (current[0] == extents[0]
&& std::all_of(std::begin(current) + 1, std::end(current), [](auto c) { return c == 0; })))
&& "Iterator was moved past the end");
return *this;
}
LLAMA_FN_HOST_ACC_INLINE
friend constexpr auto operator+(ArrayIndexIterator it, difference_type n) noexcept -> ArrayIndexIterator
{
it += n;
return it;
}
LLAMA_FN_HOST_ACC_INLINE
friend constexpr auto operator+(difference_type n, ArrayIndexIterator it) noexcept -> ArrayIndexIterator
{
return it + n;
}
LLAMA_FN_HOST_ACC_INLINE
constexpr auto operator-=(difference_type n) noexcept -> ArrayIndexIterator&
{
return operator+=(-n);
}
LLAMA_FN_HOST_ACC_INLINE
friend constexpr auto operator-(ArrayIndexIterator it, difference_type n) noexcept -> ArrayIndexIterator
{
it -= n;
return it;
}
LLAMA_FN_HOST_ACC_INLINE
friend constexpr auto operator-(const ArrayIndexIterator& a, const ArrayIndexIterator& b) noexcept
-> difference_type
{
assert(a.extents == b.extents);
difference_type n = a.current[rank - 1] - b.current[rank - 1];
difference_type size = a.extents[rank - 1];
for(auto i = static_cast<int>(rank) - 2; i >= 0; i--)
{
n += (a.current[i] - b.current[i]) * size;
size *= a.extents[i];
}
return n;
}
LLAMA_FN_HOST_ACC_INLINE
friend constexpr auto operator==(
const ArrayIndexIterator<ArrayExtents>& a,
const ArrayIndexIterator<ArrayExtents>& b) noexcept -> bool
{
assert(a.extents == b.extents);
return a.current == b.current;
}
LLAMA_FN_HOST_ACC_INLINE
friend constexpr auto operator!=(
const ArrayIndexIterator<ArrayExtents>& a,
const ArrayIndexIterator<ArrayExtents>& b) noexcept -> bool
{
return !(a == b);
}
LLAMA_FN_HOST_ACC_INLINE
friend constexpr auto operator<(const ArrayIndexIterator& a, const ArrayIndexIterator& b) noexcept -> bool
{
assert(a.extents == b.extents);
return std::lexicographical_compare(
std::begin(a.current),
std::end(a.current),
std::begin(b.current),
std::end(b.current));
}
LLAMA_FN_HOST_ACC_INLINE
friend constexpr auto operator>(const ArrayIndexIterator& a, const ArrayIndexIterator& b) noexcept -> bool
{
return b < a;
}
LLAMA_FN_HOST_ACC_INLINE
friend constexpr auto operator<=(const ArrayIndexIterator& a, const ArrayIndexIterator& b) noexcept -> bool
{
return !(a > b);
}
LLAMA_FN_HOST_ACC_INLINE
friend constexpr auto operator>=(const ArrayIndexIterator& a, const ArrayIndexIterator& b) noexcept -> bool
{
return !(a < b);
}
private:
ArrayExtents extents; // TODO(bgruber): we only need to store rank - 1 sizes
value_type current;
};
/// Range allowing to iterate over all indices in an \ref ArrayExtents.
template<typename ArrayExtents>
struct ArrayIndexRange
: private ArrayExtents
#if CAN_USE_RANGES
, std::ranges::view_base
#endif
{
static_assert(!std::is_const_v<ArrayExtents>);
constexpr ArrayIndexRange() noexcept = default;
LLAMA_FN_HOST_ACC_INLINE
constexpr explicit ArrayIndexRange(ArrayExtents extents) noexcept : ArrayExtents(extents)
{
}
LLAMA_FN_HOST_ACC_INLINE
constexpr auto begin() const noexcept -> ArrayIndexIterator<ArrayExtents>
{
return {*this, typename ArrayExtents::Index{}};
}
LLAMA_FN_HOST_ACC_INLINE
constexpr auto end() const noexcept -> ArrayIndexIterator<ArrayExtents>
{
auto endPos = typename ArrayExtents::Index{};
endPos[0] = this->toArray()[0];
return {*this, endPos};
}
};
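// Example usage of ArrayIndexRange (illustrative sketch, not part of the original header):
//
//     llama::ArrayExtents extents{2, 3};
//     for(auto ai : llama::ArrayIndexRange{extents})
//         std::cout << ai << '\n';  // visits {0,0}, {0,1}, {0,2}, {1,0}, {1,1}, {1,2}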
} // namespace llama
// ==
// == ./ArrayIndexRange.hpp ==
// ============================================================================
// #include "Core.hpp" // amalgamate: file already expanded
namespace llama
{
namespace internal
{
constexpr auto divRoundUp(std::size_t dividend, std::size_t divisor) -> std::size_t
{
return (dividend + divisor - 1) / divisor;
}
} // namespace internal
// FIXME: this test is actually not correct, because __cpp_constexpr_dynamic_alloc only guarantees constexpr
// std::allocator
#ifdef __cpp_constexpr_dynamic_alloc
namespace internal
{
template<typename T>
struct DynArray
{
constexpr DynArray() = default;
constexpr DynArray(std::size_t n)
{
data = new T[n]{};
}
constexpr ~DynArray()
{
delete[] data;
}
constexpr void resize(std::size_t n)
{
delete[] data;
data = new T[n]{};
}
T* data = nullptr;
};
} // namespace internal
/// Proves by exhaustion of the array and record dimensions that all values mapped to memory do not overlap.
// Unfortunately, this only works for smallish array dimensions, because of compiler limits on constexpr evaluation
// depth.
template<typename Mapping>
constexpr auto mapsNonOverlappingly(const Mapping& m) -> bool
{
internal::DynArray<internal::DynArray<std::uint64_t>> blobByteMapped(m.blobCount);
for(std::size_t i = 0; i < m.blobCount; i++)
blobByteMapped.data[i].resize(internal::divRoundUp(m.blobSize(i), 64));
auto testAndSet = [&](auto blob, auto offset) constexpr
{
const auto bit = std::uint64_t{1} << (offset % 64);
if(blobByteMapped.data[blob].data[offset / 64] & bit)
return true;
blobByteMapped.data[blob].data[offset / 64] |= bit;
return false;
};
bool collision = false;
forEachLeafCoord<typename Mapping::RecordDim>([&](auto rc) constexpr
{
if(collision)
return;
for(auto ai : ArrayIndexRange{m.extents()})
{
using Type
= GetType<typename Mapping::RecordDim, decltype(rc)>;
const auto [blob, offset] = m.blobNrAndOffset(ai, rc);
for(std::size_t b = 0; b < sizeof(Type); b++)
if(testAndSet(blob, offset + b))
{
collision = true;
break;
}
}
});
return !collision;
}
#endif
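// Example usage of mapsNonOverlappingly (illustrative sketch, not part of the original header;
// it assumes a mapping such as llama::mapping::AlignedAoS, which is defined further down in the
// full header, and the hypothetical Particle record dimension sketched above):
//
//     constexpr auto m = llama::mapping::AlignedAoS<llama::ArrayExtents<4, 4>, Particle>{};
//     static_assert(llama::mapsNonOverlappingly(m));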
/// Proves by exhaustion of the array and record dimensions that at least PieceLength elements are always stored
/// contiguously.
// Unfortunately, this only works for smallish array dimensions, because of compiler limits on constexpr evaluation
// depth.
template<std::size_t PieceLength, typename Mapping>
constexpr auto mapsPiecewiseContiguous(const Mapping& m) -> bool
{
bool collision = false;
forEachLeafCoord<typename Mapping::RecordDim>([&](auto rc) constexpr
{
std::size_t flatIndex = 0;
std::size_t lastBlob
= std::numeric_limits<std::size_t>::max();
std::size_t lastOffset
= std::numeric_limits<std::size_t>::max();
for(auto ai : ArrayIndexRange{m.extents()})
{
using Type
= GetType<typename Mapping::RecordDim, decltype(rc)>;
const auto [blob, offset] = m.blobNrAndOffset(ai, rc);
if(flatIndex % PieceLength != 0
&& (lastBlob != blob
|| lastOffset + sizeof(Type) != offset))
{
collision = true;
break;
}
lastBlob = blob;
lastOffset = offset;
flatIndex++;
}
});
return !collision;
}
} // namespace llama
// ==
// == ./Proofs.hpp ==
// ============================================================================
// ============================================================================
// == ./Vector.hpp ==
// ==
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
// ============================================================================
// == ./View.hpp ==
// ==
// Copyright 2018 Alexander Matthes
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
// #include "Array.hpp" // amalgamate: file already expanded
// #include "ArrayIndexRange.hpp" // amalgamate: file already expanded
// ============================================================================
// == ./BlobAllocators.hpp ==
// ==
// Copyright 2018 Alexander Matthes
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
// #include "Array.hpp" // amalgamate: file already expanded
// ============================================================================
// == ./Concepts.hpp ==
// ==
// #pragma once
// #include "Array.hpp" // amalgamate: file already expanded
// #include "Core.hpp" // amalgamate: file already expanded
// #include <type_traits> // amalgamate: file already included
#if __has_include(<concepts>)
# include <concepts>
#endif
#ifdef __cpp_lib_concepts
namespace llama
{
// clang-format off
template <typename M>
concept Mapping = requires(M m) {
typename M::ArrayExtents;
typename M::ArrayIndex;
typename M::RecordDim;
{ m.extents() } -> std::same_as<typename M::ArrayExtents>;
{ M::blobCount } -> std::convertible_to<std::size_t>;
Array<int, M::blobCount>{}; // validates constexpr-ness
{ m.blobSize(std::size_t{}) } -> std::same_as<std::size_t>;
{ m.blobNrAndOffset(typename M::ArrayIndex{}) } -> std::same_as<NrAndOffset>;
{ m.template blobNrAndOffset<0>(typename M::ArrayIndex{}) } -> std::same_as<NrAndOffset>;
{ m.blobNrAndOffset(typename M::ArrayIndex{}, llama::RecordCoord<0>{}) } -> std::same_as<NrAndOffset>;
};
// clang-format on
template<typename B>
concept Blob = requires(B b, std::size_t i)
{
// according to http://eel.is/c++draft/intro.object#3 only std::byte and unsigned char can provide storage for
// other types
std::is_same_v<decltype(b[i]), std::byte&> || std::is_same_v<decltype(b[i]), unsigned char&>;
};
// clang-format off
template <typename BA>
concept BlobAllocator = requires(BA ba, std::integral_constant<std::size_t, 16> alignment, std::size_t size) {
{ ba(alignment, size) } -> Blob;
};
// clang-format on
} // namespace llama
#endif
// ==
// == ./Concepts.hpp ==
// ============================================================================
// #include "macros.hpp" // amalgamate: file already expanded
#include <cstddef>
#include <memory>
#include <vector>
#if defined(_LIBCPP_VERSION) && _LIBCPP_VERSION < 11000
# include <boost/shared_ptr.hpp>
#endif
namespace llama::bloballoc
{
/// Allocates stack memory for a \ref View, which is copied each time a \ref View is copied.
/// \tparam BytesToReserve the amount of memory to reserve.
template<std::size_t BytesToReserve>
struct Stack
{
template<std::size_t Alignment>
LLAMA_FN_HOST_ACC_INLINE auto operator()(std::integral_constant<std::size_t, Alignment>, std::size_t) const
{
struct alignas(Alignment) AlignedArray : Array<std::byte, BytesToReserve>
{
};
return AlignedArray{};
}
};
#ifdef __cpp_lib_concepts
static_assert(BlobAllocator<Stack<64>>);
#endif
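// Usage sketch (illustrative comment): Stack is normally passed as the allocator argument of llama::allocView
// (defined further below). For a constexpr mapping with a single blob this could look like:
//
//     constexpr auto mapping = llama::mapping::MinAlignedOne<llama::ArrayExtentsStatic<1, 1>, Particle>{};
//     auto view = llama::allocView(mapping, llama::bloballoc::Stack<mapping.blobSize(0)>{});
//
// Particle is a hypothetical record dimension; llama::allocViewStack below wraps exactly this pattern.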
/// Allocates heap memory managed by a `std::shared_ptr` for a \ref View. This memory is shared between all copies
/// of a \ref View.
struct SharedPtr
{
// libc++ below 11.0.0 does not yet support shared_ptr with arrays
template<typename T>
using shared_ptr =
#if defined(_LIBCPP_VERSION) && _LIBCPP_VERSION < 11000
boost::shared_ptr<T>;
#else
std::shared_ptr<T>;
#endif
template<std::size_t Alignment>
auto operator()(std::integral_constant<std::size_t, Alignment>, std::size_t count) const
-> shared_ptr<std::byte[]>
{
auto* ptr
= static_cast<std::byte*>(::operator new[](count * sizeof(std::byte), std::align_val_t{Alignment}));
auto deleter = [=](std::byte* ptr) { ::operator delete[](ptr, std::align_val_t{Alignment}); };
return shared_ptr<std::byte[]>{ptr, deleter};
}
};
#ifdef __cpp_lib_concepts
static_assert(BlobAllocator<SharedPtr>);
#endif
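// Usage sketch (illustrative comment): SharedPtr is useful when several copies of a view should refer to the same
// memory, e.g. when a view is passed around by value:
//
//     auto view = llama::allocView(mapping, llama::bloballoc::SharedPtr{}); // mapping is a hypothetical mapping
//     auto alias = view; // copies only the shared_ptr blobs; both views address the same memory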
/// An STL-compatible allocator that allows specifying the alignment.
template<typename T, std::size_t Alignment>
struct AlignedAllocator
{
using value_type = T;
inline AlignedAllocator() noexcept = default;
template<typename T2>
inline explicit AlignedAllocator(AlignedAllocator<T2, Alignment> const&) noexcept
{
}
inline auto allocate(std::size_t n) -> T*
{
return static_cast<T*>(::operator new[](n * sizeof(T), std::align_val_t{Alignment}));
}
inline void deallocate(T* p, std::size_t)
{
::operator delete[](p, std::align_val_t{Alignment});
}
template<typename T2>
struct rebind // NOLINT(readability-identifier-naming)
{
using other = AlignedAllocator<T2, Alignment>;
};
auto operator!=(const AlignedAllocator<T, Alignment>& other) const -> bool
{
return !(*this == other);
}
auto operator==(const AlignedAllocator<T, Alignment>&) const -> bool
{
return true;
}
};
/// Allocates heap memory managed by a `std::vector` for a \ref View, which is copied each time a \ref View is
/// copied.
struct Vector
{
template<std::size_t Alignment>
inline auto operator()(std::integral_constant<std::size_t, Alignment>, std::size_t count) const
{
return std::vector<std::byte, AlignedAllocator<std::byte, Alignment>>(count);
}
};
#ifdef __cpp_lib_concepts
static_assert(BlobAllocator<Vector>);
#endif
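// Usage sketch (illustrative comment): Vector is the default allocator of llama::allocView, so the following two
// calls are equivalent (mapping is a hypothetical mapping object):
//
//     auto a = llama::allocView(mapping);
//     auto b = llama::allocView(mapping, llama::bloballoc::Vector{});
//
// Each copy of such a view copies the underlying std::vector blobs.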
} // namespace llama::bloballoc
// ==
// == ./BlobAllocators.hpp ==
// ============================================================================
// #include "Concepts.hpp" // amalgamate: file already expanded
// #include "Core.hpp" // amalgamate: file already expanded
// #include "macros.hpp" // amalgamate: file already expanded
// ============================================================================
// == ./mapping/One.hpp ==
// ==
// Copyright 2018 Alexander Matthes
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
// #include "../Core.hpp" // amalgamate: file already expanded
// ============================================================================
// == ./mapping/Common.hpp ==
// ==
// Copyright 2018 Alexander Matthes
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
// #include "../Core.hpp" // amalgamate: file already expanded
#include <climits>
namespace llama::mapping
{
/// Functor that maps an \ref ArrayIndex into linear numbers the way C++ arrays work. The fast moving index of the
/// ArrayIndex object should be the last one. E.g. ArrayIndex<3> a; stores 3 indices where a[2] should be
/// incremented in the innermost loop.
struct LinearizeArrayDimsCpp
{
template<typename ArrayExtents>
LLAMA_FN_HOST_ACC_INLINE constexpr auto size(const ArrayExtents& extents) -> std::size_t
{
return product(extents);
}
/// \param ai Index in the array dimensions.
/// \param extents Total size of the array dimensions.
/// \return Linearized index.
template<typename ArrayExtents>
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator()(
const typename ArrayExtents::Index& ai,
const ArrayExtents& extents) const -> std::size_t
{
if constexpr(ArrayExtents::rank == 0)
return 0;
else
{
std::size_t address = ai[0];
for(std::size_t i = 1; i < ArrayExtents::rank; i++)
{
address *= extents[i];
address += ai[i];
}
return address;
}
}
};
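// Worked example (illustrative comment): for extents {3, 4} and index {i, j}, the functor computes i * 4 + j, so
// the last index moves fastest (row-major, like C++ arrays). E.g. index {1, 2} linearizes to 1 * 4 + 2 == 6.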
/// Functor that maps a \ref ArrayIndex into linear numbers the way Fortran arrays work. The fast moving index of
/// the ArrayIndex object should be the last one. E.g. ArrayIndex<3> a; stores 3 indices where a[2] should be
/// incremented in the innermost loop.
struct LinearizeArrayDimsFortran
{
template<typename ArrayExtents>
LLAMA_FN_HOST_ACC_INLINE constexpr auto size(const ArrayExtents& extents) -> std::size_t
{
return product(extents);
}
/// \param ai Index in the array dimensions.
/// \param extents Total size of the array dimensions.
/// \return Linearized index.
template<typename ArrayExtents>
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator()(
const typename ArrayExtents::Index& ai,
const ArrayExtents& extents) const -> std::size_t
{
if constexpr(ArrayExtents::rank == 0)
return 0;
else
{
std::size_t address = ai[ArrayExtents::rank - 1];
for(int i = static_cast<int>(ArrayExtents::rank) - 2; i >= 0; i--)
{
address *= extents[i];
address += ai[i];
}
return address;
}
}
};
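// Worked example (illustrative comment): for the same extents {3, 4} and index {i, j}, this functor computes
// j * 3 + i, so the first index moves fastest (column-major, like Fortran arrays). E.g. index {1, 2} linearizes to
// 2 * 3 + 1 == 7.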
/// Functor that maps an \ref ArrayIndex into linear numbers using the Z-order space filling curve (Morton codes).
struct LinearizeArrayDimsMorton
{
template<typename ArrayExtents>
LLAMA_FN_HOST_ACC_INLINE constexpr auto size(const ArrayExtents& extents) const -> std::size_t
{
if constexpr(ArrayExtents::rank == 0)
return 0;
else
{
std::size_t longest = extents[0];
for(std::size_t i = 1; i < ArrayExtents::rank; i++)
longest = std::max(longest, extents[i]);
const auto longestPO2 = bit_ceil(longest);
return intPow(longestPO2, ArrayExtents::rank);
}
}
/// \param ai Coordinate in the array dimensions.
/// \param extents Total size of the array dimensions.
/// \return Linearized index.
template<typename ArrayExtents>
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator()(
const typename ArrayExtents::Index& ai,
[[maybe_unused]] const ArrayExtents& extents) const -> std::size_t
{
std::size_t r = 0;
for(std::size_t bit = 0; bit < (sizeof(std::size_t) * CHAR_BIT) / ArrayExtents::rank; bit++)
for(std::size_t i = 0; i < ArrayExtents::rank; i++)
r |= (ai[i] & (std::size_t{1} << bit)) << ((bit + 1) * (ArrayExtents::rank - 1) - i);
return r;
}
private:
LLAMA_FN_HOST_ACC_INLINE static constexpr auto bit_ceil(std::size_t n) -> std::size_t
{
std::size_t r = 1;
while(r < n)
r <<= 1u;
return r;
}
LLAMA_FN_HOST_ACC_INLINE static constexpr auto intPow(std::size_t b, std::size_t e) -> std::size_t
{
e--;
auto r = b;
while(e != 0u)
{
r *= b;
e--;
}
return r;
}
};
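// Worked example (illustrative comment): in 2 dimensions the Morton code interleaves the bits of both indices,
// with ai[0] contributing the higher bit of each pair. E.g. ai = {2, 3} (binary 10 and 11) interleaves to binary
// 1101 == 13. The extents argument only influences size(), not the computed code.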
/// Flattens the record dimension in the order fields are written.
template<typename RecordDim>
struct FlattenRecordDimInOrder
{
using FlatRecordDim = llama::FlatRecordDim<RecordDim>;
template<std::size_t... RecordCoords>
static constexpr std::size_t flatIndex = flatRecordCoord<RecordDim, RecordCoord<RecordCoords...>>;
};
/// Flattens the record dimension by sorting the fields according to a given predicate on the field types.
/// @tparam Less A binary predicate accepting two field types and exposing a member `value`, which must be true if
/// the first field type is less than the second one, and false otherwise.
template<typename RecordDim, template<typename, typename> typename Less>
struct FlattenRecordDimSorted
{
private:
using FlatOrigRecordDim = llama::FlatRecordDim<RecordDim>;
using FlatSortedRecordDim = boost::mp11::mp_sort<FlatOrigRecordDim, Less>;
template<typename A, typename B>
using LessWithIndices
= Less<boost::mp11::mp_at<FlatOrigRecordDim, A>, boost::mp11::mp_at<FlatOrigRecordDim, B>>;
// A permutation from new FlatSortedRecordDim index to old FlatOrigRecordDim index
using PermutedIndices
= boost::mp11::mp_sort<boost::mp11::mp_iota<boost::mp11::mp_size<FlatOrigRecordDim>>, LessWithIndices>;
template<typename A, typename B>
using LessInvertPermutation = std::bool_constant<(
boost::mp11::mp_at<PermutedIndices, A>::value < boost::mp11::mp_at<PermutedIndices, B>::value)>;
// A permutation from old FlatOrigRecordDim index to new FlatSortedRecordDim index
using InversePermutedIndices = boost::mp11::
mp_sort<boost::mp11::mp_iota<boost::mp11::mp_size<FlatOrigRecordDim>>, LessInvertPermutation>;
public:
using FlatRecordDim = FlatSortedRecordDim;
template<std::size_t... RecordCoords>
static constexpr std::size_t flatIndex = []() constexpr
{
constexpr auto indexBefore = flatRecordCoord<RecordDim, RecordCoord<RecordCoords...>>;
constexpr auto indexAfter = boost::mp11::mp_at_c<InversePermutedIndices, indexBefore>::value;
return indexAfter;
}
();
};
namespace internal
{
template<typename A, typename B>
using LessAlignment = std::bool_constant<alignof(A) < alignof(B)>;
template<typename A, typename B>
using MoreAlignment = std::bool_constant<(alignof(A) > alignof(B))>;
} // namespace internal
/// Flattens and sorts the record dimension by increasing alignment of its fields.
template<typename RecordDim>
using FlattenRecordDimIncreasingAlignment = FlattenRecordDimSorted<RecordDim, internal::LessAlignment>;
/// Flattens and sorts the record dimension by decreasing alignment of its fields.
template<typename RecordDim>
using FlattenRecordDimDecreasingAlignment = FlattenRecordDimSorted<RecordDim, internal::MoreAlignment>;
/// Flattens and sorts the record dimension by the alignment of its fields to minimize padding.
template<typename RecordDim>
using FlattenRecordDimMinimizePadding = FlattenRecordDimIncreasingAlignment<RecordDim>;
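// Illustrative comment: for a hypothetical record dimension whose leaf field types are {float, double, float},
// FlattenRecordDimInOrder keeps that order (4 bytes of padding before the double when aligned), while
// FlattenRecordDimMinimizePadding / FlattenRecordDimIncreasingAlignment reorder it to {float, float, double} and
// FlattenRecordDimDecreasingAlignment to {double, float, float}, both of which need no padding between fields.
// The chosen order is exposed via the FlatRecordDim member type alias.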
} // namespace llama::mapping
// ==
// == ./mapping/Common.hpp ==
// ============================================================================
namespace llama::mapping
{
/// Maps all array dimension indices to the same location and lays out struct members consecutively. This mapping is
/// used for temporary, single element views.
/// \tparam AlignAndPad If true, padding bytes are inserted to guarantee that struct members are properly aligned.
/// If false, struct members are tightly packed.
/// \tparam FlattenRecordDim Defines how the record dimension's fields should be flattened. See \ref
/// FlattenRecordDimInOrder, \ref FlattenRecordDimIncreasingAlignment, \ref FlattenRecordDimDecreasingAlignment and
/// \ref FlattenRecordDimMinimizePadding.
template<
typename TArrayExtents,
typename TRecordDim,
bool AlignAndPad = true,
template<typename> typename FlattenRecordDim = FlattenRecordDimMinimizePadding>
struct One : TArrayExtents
{
using ArrayExtents = TArrayExtents;
using ArrayIndex = typename ArrayExtents::Index;
using RecordDim = TRecordDim;
static constexpr std::size_t blobCount = 1;
constexpr One() = default;
LLAMA_FN_HOST_ACC_INLINE
constexpr explicit One(ArrayExtents extents, RecordDim = {}) : ArrayExtents(extents)
{
}
LLAMA_FN_HOST_ACC_INLINE constexpr auto extents() const -> ArrayExtents
{
return ArrayExtents{*this};
}
LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(std::size_t) const -> std::size_t
{
return flatSizeOf<typename Flattener::FlatRecordDim, AlignAndPad, false>; // no tail padding
}
template<std::size_t... RecordCoords>
LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset(ArrayIndex, RecordCoord<RecordCoords...> = {}) const
-> NrAndOffset
{
constexpr std::size_t flatFieldIndex =
#ifdef __NVCC__
*& // mess with nvcc compiler state to workaround bug
#endif
Flattener::template flatIndex<RecordCoords...>;
constexpr auto offset = flatOffsetOf<typename Flattener::FlatRecordDim, flatFieldIndex, AlignAndPad>;
return {0, offset};
}
private:
using Flattener = FlattenRecordDim<TRecordDim>;
};
/// One mapping preserving the alignment of the field types by inserting padding.
/// \see One
template<typename ArrayExtents, typename RecordDim>
using AlignedOne = One<ArrayExtents, RecordDim, true, FlattenRecordDimInOrder>;
/// One mapping preserving the alignment of the field types by inserting padding and permuting the field order to
/// minimize this padding.
/// \see One
template<typename ArrayExtents, typename RecordDim>
using MinAlignedOne = One<ArrayExtents, RecordDim, true, FlattenRecordDimMinimizePadding>;
/// One mapping packing the field types tightly, violating the types' alignment requirements.
/// \see One
template<typename ArrayExtents, typename RecordDim>
using PackedOne = One<ArrayExtents, RecordDim, false, FlattenRecordDimInOrder>;
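// Illustrative comment: for a hypothetical record dimension with the leaf field types {float, double, float},
// PackedOne needs 4 + 8 + 4 == 16 bytes per record, AlignedOne needs 20 bytes (4 bytes of padding before the
// double, no tail padding), and MinAlignedOne reorders the fields to {float, float, double} and needs 16 bytes
// without any padding. All three map every array index to the same single blob location.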
template<typename Mapping>
inline constexpr bool isOne = false;
template<typename ArrayExtents, typename RecordDim, bool AlignAndPad, template<typename> typename FlattenRecordDim>
inline constexpr bool isOne<One<ArrayExtents, RecordDim, AlignAndPad, FlattenRecordDim>> = true;
} // namespace llama::mapping
// ==
// == ./mapping/One.hpp ==
// ============================================================================
// #include <type_traits> // amalgamate: file already included
namespace llama
{
#ifdef __cpp_lib_concepts
template<typename TMapping, Blob BlobType>
#else
template<typename TMapping, typename BlobType>
#endif
struct View;
namespace internal
{
template<typename Allocator, typename RecordDim>
using AllocatorBlobType
= decltype(std::declval<Allocator>()(std::integral_constant<std::size_t, alignOf<RecordDim>>{}, 0));
LLAMA_SUPPRESS_HOST_DEVICE_WARNING
template<typename Allocator, typename Mapping, std::size_t... Is>
LLAMA_FN_HOST_ACC_INLINE auto makeBlobArray(
const Allocator& alloc,
const Mapping& mapping,
std::integer_sequence<std::size_t, Is...>)
-> Array<AllocatorBlobType<Allocator, typename Mapping::RecordDim>, Mapping::blobCount>
{
[[maybe_unused]] constexpr auto alignment
= alignOf<typename Mapping::RecordDim>; // g++-12 warns that alignment is unused
return {alloc(std::integral_constant<std::size_t, alignment>{}, mapping.blobSize(Is))...};
}
} // namespace internal
/// Same as \ref allocView but does not run field constructors.
#ifdef __cpp_lib_concepts
template<typename Mapping, BlobAllocator Allocator = bloballoc::Vector>
#else
template<typename Mapping, typename Allocator = bloballoc::Vector>
#endif
LLAMA_FN_HOST_ACC_INLINE auto allocViewUninitialized(Mapping mapping = {}, const Allocator& alloc = {})
-> View<Mapping, internal::AllocatorBlobType<Allocator, typename Mapping::RecordDim>>
{
auto blobs = internal::makeBlobArray(alloc, mapping, std::make_index_sequence<Mapping::blobCount>{});
return {std::move(mapping), std::move(blobs)};
}
namespace internal
{
template<typename Mapping, typename RecordCoord, typename = void>
struct IsComputed : std::false_type
{
};
template<typename Mapping, typename RecordCoord>
struct IsComputed<Mapping, RecordCoord, std::void_t<decltype(Mapping::isComputed(RecordCoord{}))>>
: std::bool_constant<Mapping::isComputed(RecordCoord{})>
{
};
} // namespace internal
/// Returns true if the field accessed via the given mapping and record coordinate is a computed value.
template<typename Mapping, typename RecordCoord>
inline constexpr bool isComputed = internal::IsComputed<Mapping, RecordCoord>::value;
/// Runs the constructors of all fields reachable through the given view. Computed fields are not constructed.
template<typename Mapping, typename BlobType>
LLAMA_FN_HOST_ACC_INLINE void constructFields(View<Mapping, BlobType>& view)
{
using View = View<Mapping, BlobType>;
using RecordDim = typename View::RecordDim;
forEachADCoord(
view.mapping().extents(),
[&](typename View::ArrayIndex ai)
{
if constexpr(isRecord<RecordDim> || internal::IsBoundedArray<RecordDim>::value)
forEachLeafCoord<RecordDim>(
[&](auto rc)
{
// TODO(bgruber): we could initialize computed fields if we can write to those. We could
// test if the returned value can be cast to a T& and then attempt to write.
if constexpr(!isComputed<Mapping, decltype(rc)>)
new(&view(ai)(rc)) GetType<RecordDim, decltype(rc)>;
});
else if constexpr(!isComputed<Mapping, RecordCoord<>>)
new(&view(ai)) RecordDim;
});
}
/// Creates a view based on the given mapping, e.g. \ref AoS or \ref SoA. For allocating the view's underlying
/// memory, the specified allocator callable is used (or the default one, which is \ref bloballoc::Vector). The
/// allocator callable is called with the alignment and size of bytes to allocate for each blob of the mapping.
/// The constructors are run for all fields by calling \ref constructFields. This function is the preferred way to
/// create a \ref View. See also \ref allocViewUninitialized.
#ifdef __cpp_lib_concepts
template<typename Mapping, BlobAllocator Allocator = bloballoc::Vector>
#else
template<typename Mapping, typename Allocator = bloballoc::Vector>
#endif
LLAMA_FN_HOST_ACC_INLINE auto allocView(Mapping mapping = {}, const Allocator& alloc = {})
-> View<Mapping, internal::AllocatorBlobType<Allocator, typename Mapping::RecordDim>>
{
auto view = allocViewUninitialized(std::move(mapping), alloc);
constructFields(view);
return view;
}
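// Usage sketch (illustrative comment; the tag, record dimension and extents type names are assumptions, \ref AoS
// is the mapping referenced above as an example):
//
//     struct X{}; struct Y{};
//     using Particle = llama::Record<llama::Field<X, float>, llama::Field<Y, float>>;
//     auto view = llama::allocView(llama::mapping::AoS<llama::ArrayExtentsDynamic<1>, Particle>{{1000}});
//     view(42)(X{}) = 1.0f; // element 42, field X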
/// Allocates a \ref View holding a single record backed by stack memory (\ref bloballoc::Stack).
/// \tparam Dim Dimension of the \ref ArrayExtents of the \ref View.
template<std::size_t Dim, typename RecordDim>
LLAMA_FN_HOST_ACC_INLINE auto allocViewStack() -> decltype(auto)
{
constexpr auto mapping = mapping::MinAlignedOne<ArrayExtentsStatic<Dim, 1>, RecordDim>{};
return allocView(mapping, bloballoc::Stack<mapping.blobSize(0)>{});
}
template<typename View, typename BoundRecordCoord = RecordCoord<>, bool OwnView = false>
struct VirtualRecord;
/// A \ref VirtualRecord that owns and holds a single value.
template<typename RecordDim>
using One = VirtualRecord<decltype(allocViewStack<0, RecordDim>()), RecordCoord<>, true>;
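// Usage sketch (illustrative comment): llama::One is handy as a local temporary holding a single record, e.g. with
// the hypothetical Particle/X/Y names from the sketch above:
//
//     llama::One<Particle> p;
//     p(X{}) = 1.0f;
//     p(Y{}) = 2.0f;
//     view(0) = p; // copies all fields into element 0 of a view with a matching record dimension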
// TODO(bgruber): Higher dimensional iterators might not have good codegen. Multiple nested loops seem to be
// superior to a single iterator over multiple dimensions; at least compilers are able to produce better code for them.
// std::mdspan also discovered similar difficulties and there was a discussion in WG21 in Oulu 2016 to
// remove/postpone iterators from the design. In std::mdspan's design, the iterator iterated over the co-domain.
template<typename View>
struct Iterator
{
using ArrayIndexIterator = llama::ArrayIndexIterator<typename View::ArrayExtents>;
using iterator_category = std::random_access_iterator_tag;
using value_type = One<typename View::RecordDim>;
using difference_type = typename ArrayIndexIterator::difference_type;
using pointer = internal::IndirectValue<VirtualRecord<View>>;
using reference = VirtualRecord<View>;
constexpr Iterator() = default;
LLAMA_FN_HOST_ACC_INLINE constexpr Iterator(ArrayIndexIterator arrayIndex, View* view)
: arrayIndex(arrayIndex)
, view(view)
{
}
LLAMA_FN_HOST_ACC_INLINE
constexpr auto operator++() -> Iterator&
{
++arrayIndex;
return *this;
}
LLAMA_FN_HOST_ACC_INLINE
constexpr auto operator++(int) -> Iterator
{
auto tmp = *this;
++*this;
return tmp;
}
LLAMA_FN_HOST_ACC_INLINE
constexpr auto operator--() -> Iterator&
{
--arrayIndex;
return *this;
}
LLAMA_FN_HOST_ACC_INLINE
constexpr auto operator--(int) -> Iterator
{
auto tmp{*this};
--*this;
return tmp;
}
LLAMA_FN_HOST_ACC_INLINE
constexpr auto operator*() const -> reference
{
return (*view)(*arrayIndex);
}
LLAMA_FN_HOST_ACC_INLINE
constexpr auto operator->() const -> pointer
{
return {**this};
}
LLAMA_FN_HOST_ACC_INLINE
constexpr auto operator[](difference_type i) const -> reference
{
return *(*this + i);
}
LLAMA_FN_HOST_ACC_INLINE
constexpr auto operator+=(difference_type n) -> Iterator&
{
arrayIndex += n;
return *this;
}
LLAMA_FN_HOST_ACC_INLINE
friend constexpr auto operator+(Iterator it, difference_type n) -> Iterator
{
it += n;
return it;
}
LLAMA_FN_HOST_ACC_INLINE
friend constexpr auto operator+(difference_type n, Iterator it) -> Iterator
{
return it + n;
}
LLAMA_FN_HOST_ACC_INLINE
constexpr auto operator-=(difference_type n) -> Iterator&
{
arrayIndex -= n;
return *this;
}
LLAMA_FN_HOST_ACC_INLINE
friend constexpr auto operator-(Iterator it, difference_type n) -> Iterator
{
it -= n;
return it;
}
LLAMA_FN_HOST_ACC_INLINE
friend constexpr auto operator-(const Iterator& a, const Iterator& b) -> difference_type
{
assert(a.view == b.view);
return static_cast<std::ptrdiff_t>(a.arrayIndex - b.arrayIndex);
}
LLAMA_FN_HOST_ACC_INLINE
friend constexpr auto operator==(const Iterator& a, const Iterator& b) -> bool
{
assert(a.view == b.view);
return a.arrayIndex == b.arrayIndex;
}
LLAMA_FN_HOST_ACC_INLINE
friend constexpr auto operator!=(const Iterator& a, const Iterator& b) -> bool
{
return !(a == b);
}
LLAMA_FN_HOST_ACC_INLINE
friend constexpr auto operator<(const Iterator& a, const Iterator& b) -> bool
{
assert(a.view == b.view);
return a.arrayIndex < b.arrayIndex;
}
LLAMA_FN_HOST_ACC_INLINE
friend constexpr auto operator>(const Iterator& a, const Iterator& b) -> bool
{
return b < a;
}
LLAMA_FN_HOST_ACC_INLINE
friend constexpr auto operator<=(const Iterator& a, const Iterator& b) -> bool
{
return !(a > b);
}
LLAMA_FN_HOST_ACC_INLINE
friend constexpr auto operator>=(const Iterator& a, const Iterator& b) -> bool
{
return !(a < b);
}
ArrayIndexIterator arrayIndex;
View* view;
};
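// Usage sketch (illustrative comment): the iterator enables range-based for loops over a view, iterating the full
// array extents and yielding a VirtualRecord per element (names reuse the hypothetical sketch above):
//
//     for(auto vr : view)
//         vr(X{}) += 1.0f;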
/// Central LLAMA class. It holds the memory for storage and provides access to values stored there, as defined by
/// a mapping. A view should be created using \ref allocView.
/// \tparam TMapping The mapping used by the view to map accesses into memory.
/// \tparam BlobType The storage type used by the view holding memory.
#ifdef __cpp_lib_concepts
template<typename TMapping, Blob BlobType>
#else
template<typename TMapping, typename BlobType>
#endif
struct View
: private TMapping
#if CAN_USE_RANGES
, std::ranges::view_base
#endif
{
static_assert(!std::is_const_v<TMapping>);
using Mapping = TMapping;
using ArrayExtents = typename Mapping::ArrayExtents;
using ArrayIndex = typename Mapping::ArrayIndex;
using RecordDim = typename Mapping::RecordDim;
using iterator = Iterator<View>;
using const_iterator = Iterator<const View>;
static_assert(
std::is_same_v<Mapping, std::decay_t<Mapping>>,
"Mapping must not be const qualified or a reference. Are you using decltype(...) as View template "
"argument?");
static_assert(
std::is_same_v<ArrayExtents, std::decay_t<ArrayExtents>>,
"Mapping::ArrayExtents must not be const qualified or a reference. Are you using decltype(...) as mapping "
"template argument?");
View() = default;
LLAMA_FN_HOST_ACC_INLINE
View(Mapping mapping, Array<BlobType, Mapping::blobCount> storageBlobs)
: Mapping(std::move(mapping))
, storageBlobs(std::move(storageBlobs))
{
}
LLAMA_FN_HOST_ACC_INLINE auto mapping() -> Mapping&
{
return static_cast<Mapping&>(*this);
}
LLAMA_FN_HOST_ACC_INLINE auto mapping() const -> const Mapping&
{
return static_cast<const Mapping&>(*this);
}
/// Retrieves the \ref VirtualRecord at the given \ref ArrayIndex index.
LLAMA_FN_HOST_ACC_INLINE auto operator()(ArrayIndex ai) const -> decltype(auto)
{
if constexpr(isRecord<RecordDim> || internal::IsBoundedArray<RecordDim>::value)
{
LLAMA_FORCE_INLINE_RECURSIVE
return VirtualRecord<const View>{ai, *this};
}
else
{
LLAMA_FORCE_INLINE_RECURSIVE
return accessor(ai, RecordCoord<>{});
}
}
LLAMA_FN_HOST_ACC_INLINE auto operator()(ArrayIndex ai) -> decltype(auto)
{
if constexpr(isRecord<RecordDim> || internal::IsBoundedArray<RecordDim>::value)
{
LLAMA_FORCE_INLINE_RECURSIVE
return VirtualRecord<View>{ai, *this};
}
else
{
LLAMA_FORCE_INLINE_RECURSIVE
return accessor(ai, RecordCoord<>{});
}
}
/// Retrieves the \ref VirtualRecord at the \ref ArrayIndex index constructed from the passed component
/// indices.
template<typename... Indices>
LLAMA_FN_HOST_ACC_INLINE auto operator()(Indices... indices) const -> decltype(auto)
{
static_assert(
sizeof...(Indices) == ArrayIndex::rank,
"Please specify as many indices as you have array dimensions");
static_assert(
std::conjunction_v<std::is_convertible<Indices, std::size_t>...>,
"Indices must be convertible to std::size_t");
LLAMA_FORCE_INLINE_RECURSIVE
return (*this)(ArrayIndex{static_cast<typename ArrayIndex::value_type>(indices)...});
}
template<typename... Indices>
LLAMA_FN_HOST_ACC_INLINE auto operator()(Indices... indices) -> decltype(auto)
{
static_assert(
sizeof...(Indices) == ArrayIndex::rank,
"Please specify as many indices as you have array dimensions");
static_assert(
std::conjunction_v<std::is_convertible<Indices, std::size_t>...>,
"Indices must be convertible to std::size_t");
LLAMA_FORCE_INLINE_RECURSIVE
return (*this)(ArrayIndex{static_cast<typename ArrayIndex::value_type>(indices)...});
}
/// Retrieves the \ref VirtualRecord at the \ref ArrayIndex index constructed from the passed component
/// indices.
LLAMA_FN_HOST_ACC_INLINE auto operator[](ArrayIndex ai) const -> decltype(auto)
{
LLAMA_FORCE_INLINE_RECURSIVE
return (*this)(ai);
}
LLAMA_FN_HOST_ACC_INLINE auto operator[](ArrayIndex ai) -> decltype(auto)
{
LLAMA_FORCE_INLINE_RECURSIVE
return (*this)(ai);
}
/// Retrieves the \ref VirtualRecord at the 1D \ref ArrayIndex index constructed from the passed index.
LLAMA_FN_HOST_ACC_INLINE auto operator[](std::size_t index) const -> decltype(auto)
{
LLAMA_FORCE_INLINE_RECURSIVE
return (*this)(index);
}
LLAMA_FN_HOST_ACC_INLINE auto operator[](std::size_t index) -> decltype(auto)
{
LLAMA_FORCE_INLINE_RECURSIVE
return (*this)(index);
}
LLAMA_FN_HOST_ACC_INLINE
auto begin() -> iterator
{
return {ArrayIndexRange<ArrayExtents>{mapping().extents()}.begin(), this};
}
LLAMA_FN_HOST_ACC_INLINE
auto begin() const -> const_iterator
{
return {ArrayIndexRange<ArrayExtents>{mapping().extents()}.begin(), this};
}
LLAMA_FN_HOST_ACC_INLINE
auto end() -> iterator
{
return {ArrayIndexRange<ArrayExtents>{mapping().extents()}.end(), this};
}
LLAMA_FN_HOST_ACC_INLINE
auto end() const -> const_iterator
{
return {ArrayIndexRange<ArrayExtents>{mapping().extents()}.end(), this};
}
Array<BlobType, Mapping::blobCount> storageBlobs;
private:
template<typename TView, typename TBoundRecordCoord, bool OwnView>
friend struct VirtualRecord;
LLAMA_SUPPRESS_HOST_DEVICE_WARNING
template<std::size_t... Coords>
LLAMA_FN_HOST_ACC_INLINE auto accessor(ArrayIndex ai, RecordCoord<Coords...> rc = {}) const -> decltype(auto)
{
if constexpr(llama::isComputed<Mapping, RecordCoord<Coords...>>)
return mapping().compute(ai, rc, storageBlobs);
else
{
const auto [nr, offset] = mapping().blobNrAndOffset(ai, rc);
using Type = GetType<RecordDim, RecordCoord<Coords...>>;
return reinterpret_cast<const Type&>(storageBlobs[nr][offset]);
}
}
LLAMA_SUPPRESS_HOST_DEVICE_WARNING
template<std::size_t... Coords>
LLAMA_FN_HOST_ACC_INLINE auto accessor(ArrayIndex ai, RecordCoord<Coords...> rc = {}) -> decltype(auto)
{
if constexpr(llama::isComputed<Mapping, RecordCoord<Coords...>>)
return mapping().compute(ai, rc, storageBlobs);
else
{
const auto [nr, offset] = mapping().blobNrAndOffset(ai, rc);
using Type = GetType<RecordDim, RecordCoord<Coords...>>;
using QualifiedType = std::conditional_t<
std::is_const_v<std::remove_reference_t<decltype(storageBlobs[nr][offset])>>,
const Type,
Type>;
return reinterpret_cast<QualifiedType&>(storageBlobs[nr][offset]);
}
}
};
template<typename View>
inline constexpr auto IsView = false;
template<typename Mapping, typename BlobType>
inline constexpr auto IsView<View<Mapping, BlobType>> = true;
/// Acts like a \ref View, but shows only a smaller and/or shifted part of another view it references, the parent
/// view.
template<typename TParentView>
struct VirtualView
{
using ParentView = TParentView; ///< type of the parent view
using Mapping = typename ParentView::Mapping; ///< mapping of the parent view
using ArrayExtents = typename Mapping::ArrayExtents; ///< array extents of the parent view
using ArrayIndex = typename Mapping::ArrayIndex; ///< array index of the parent view
/// Creates a VirtualView given a parent \ref View and offset.
LLAMA_FN_HOST_ACC_INLINE
VirtualView(ParentView& parentView, ArrayIndex offset) : parentView(parentView), offset(offset)
{
}
template<std::size_t... Coords>
LLAMA_FN_HOST_ACC_INLINE auto accessor(ArrayIndex ai) const -> const auto&
{
return parentView.template accessor<Coords...>(ArrayIndex{ai + offset});
}
template<std::size_t... Coords>
LLAMA_FN_HOST_ACC_INLINE auto accessor(ArrayIndex ai) -> auto&
{
return parentView.template accessor<Coords...>(ArrayIndex{ai + offset});
}
/// Same as \ref View::operator()(ArrayIndex), but shifted by the offset of this \ref VirtualView.
LLAMA_FN_HOST_ACC_INLINE auto operator()(ArrayIndex ai) const -> decltype(auto)
{
LLAMA_FORCE_INLINE_RECURSIVE
return parentView(ArrayIndex{ai + offset});
}
LLAMA_FN_HOST_ACC_INLINE auto operator()(ArrayIndex ai) -> decltype(auto)
{
LLAMA_FORCE_INLINE_RECURSIVE
return parentView(ArrayIndex{ai + offset});
}
/// Same as corresponding operator in \ref View, but shifted by the offset of this \ref VirtualView.
template<typename... Indices>
LLAMA_FN_HOST_ACC_INLINE auto operator()(Indices... indices) const -> decltype(auto)
{
static_assert(
sizeof...(Indices) == ArrayIndex::rank,
"Please specify as many indices as you have array dimensions");
static_assert(
std::conjunction_v<std::is_convertible<Indices, std::size_t>...>,
"Indices must be convertible to std::size_t");
LLAMA_FORCE_INLINE_RECURSIVE
return parentView(
ArrayIndex{ArrayIndex{static_cast<typename ArrayIndex::value_type>(indices)...} + offset});
}
template<typename... Indices>
LLAMA_FN_HOST_ACC_INLINE auto operator()(Indices... indices) -> decltype(auto)
{
static_assert(
sizeof...(Indices) == ArrayIndex::rank,
"Please specify as many indices as you have array dimensions");
static_assert(
std::conjunction_v<std::is_convertible<Indices, std::size_t>...>,
"Indices must be convertible to std::size_t");
LLAMA_FORCE_INLINE_RECURSIVE
return parentView(
ArrayIndex{ArrayIndex{static_cast<typename ArrayIndex::value_type>(indices)...} + offset});
}
template<std::size_t... Coord>
LLAMA_FN_HOST_ACC_INLINE auto operator()(RecordCoord<Coord...> = {}) const -> decltype(auto)
{
LLAMA_FORCE_INLINE_RECURSIVE
return accessor<Coord...>(ArrayIndex{});
}
template<std::size_t... Coord>
LLAMA_FN_HOST_ACC_INLINE auto operator()(RecordCoord<Coord...> = {}) -> decltype(auto)
{
LLAMA_FORCE_INLINE_RECURSIVE
return accessor<Coord...>(ArrayIndex{});
}
ParentView& parentView; ///< reference to parent view.
const ArrayIndex
offset; ///< offset by which this view's \ref ArrayIndex indices are shifted when passed to the parent view.
};
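// Usage sketch (illustrative comment): a VirtualView exposes a shifted window into an existing view, e.g. with the
// hypothetical 1D view from the sketches above:
//
//     auto sub = llama::VirtualView{view, typename decltype(view)::ArrayIndex{10}};
//     sub(5)(X{}) = 1.0f; // resolves to element 10 + 5 == 15 of the parent view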
} // namespace llama
// ==
// == ./View.hpp ==
// ============================================================================
// ============================================================================
// == ./VirtualRecord.hpp ==
// ==
// Copyright 2018 Alexander Matthes
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
// #include "View.hpp" // amalgamate: file already expanded
#include <iosfwd>
// #include <type_traits> // amalgamate: file already included
namespace llama
{
template<typename View, typename BoundRecordCoord, bool OwnView>
struct VirtualRecord;
template<typename View>
inline constexpr auto is_VirtualRecord = false;
template<typename View, typename BoundRecordCoord, bool OwnView>
inline constexpr auto is_VirtualRecord<VirtualRecord<View, BoundRecordCoord, OwnView>> = true;
/// Creates a single \ref VirtualRecord owning a view with stack memory and copies all values from an existing \ref
/// VirtualRecord.
template<typename VirtualRecord>
LLAMA_FN_HOST_ACC_INLINE auto copyVirtualRecordStack(const VirtualRecord& vd) -> decltype(auto)
{
One<typename VirtualRecord::AccessibleRecordDim> temp;
temp = vd;
return temp;
}
namespace internal
{
template<
typename Functor,
typename LeftRecord,
typename RightView,
typename RightBoundRecordDim,
bool RightOwnView>
LLAMA_FN_HOST_ACC_INLINE auto virtualRecordArithOperator(
LeftRecord& left,
const VirtualRecord<RightView, RightBoundRecordDim, RightOwnView>& right) -> LeftRecord&
{
using RightRecord = VirtualRecord<RightView, RightBoundRecordDim, RightOwnView>;
// if the left and right record dimensions are the same, a single loop is enough and no tag check is needed.
// This saves a lot of compilation time.
if constexpr(std::is_same_v<
typename LeftRecord::AccessibleRecordDim,
typename RightRecord::AccessibleRecordDim>)
{
forEachLeafCoord<typename LeftRecord::AccessibleRecordDim>([&](auto rc) LLAMA_LAMBDA_INLINE
{ Functor{}(left(rc), right(rc)); });
}
else
{
forEachLeafCoord<typename LeftRecord::AccessibleRecordDim>(
[&](auto leftRC) LLAMA_LAMBDA_INLINE
{
using LeftInnerCoord = decltype(leftRC);
forEachLeafCoord<typename RightRecord::AccessibleRecordDim>(
[&](auto rightRC) LLAMA_LAMBDA_INLINE
{
using RightInnerCoord = decltype(rightRC);
if constexpr(hasSameTags<
typename LeftRecord::AccessibleRecordDim,
LeftInnerCoord,
typename RightRecord::AccessibleRecordDim,
RightInnerCoord>)
{
Functor{}(left(leftRC), right(rightRC));
}
});
});
}
return left;
}
template<typename Functor, typename LeftRecord, typename T>
LLAMA_FN_HOST_ACC_INLINE auto virtualRecordArithOperator(LeftRecord& left, const T& right) -> LeftRecord&
{
forEachLeafCoord<typename LeftRecord::AccessibleRecordDim>([&](auto leftRC) LLAMA_LAMBDA_INLINE
{ Functor{}(left(leftRC), right); });
return left;
}
template<
typename Functor,
typename LeftRecord,
typename RightView,
typename RightBoundRecordDim,
bool RightOwnView>
LLAMA_FN_HOST_ACC_INLINE auto virtualRecordRelOperator(
const LeftRecord& left,
const VirtualRecord<RightView, RightBoundRecordDim, RightOwnView>& right) -> bool
{
using RightRecord = VirtualRecord<RightView, RightBoundRecordDim, RightOwnView>;
bool result = true;
// if the left and right record dimensions are the same, a single loop is enough and no tag check is needed.
// This saves a lot of compilation time.
if constexpr(std::is_same_v<
typename LeftRecord::AccessibleRecordDim,
typename RightRecord::AccessibleRecordDim>)
{
forEachLeafCoord<typename LeftRecord::AccessibleRecordDim>(
[&](auto rc) LLAMA_LAMBDA_INLINE { result &= Functor{}(left(rc), right(rc)); });
}
else
{
forEachLeafCoord<typename LeftRecord::AccessibleRecordDim>(
[&](auto leftRC) LLAMA_LAMBDA_INLINE
{
using LeftInnerCoord = decltype(leftRC);
forEachLeafCoord<typename RightRecord::AccessibleRecordDim>(
[&](auto rightRC) LLAMA_LAMBDA_INLINE
{
using RightInnerCoord = decltype(rightRC);
if constexpr(hasSameTags<
typename LeftRecord::AccessibleRecordDim,
LeftInnerCoord,
typename RightRecord::AccessibleRecordDim,
RightInnerCoord>)
{
result &= Functor{}(left(leftRC), right(rightRC));
}
});
});
}
return result;
}
template<typename Functor, typename LeftRecord, typename T>
LLAMA_FN_HOST_ACC_INLINE auto virtualRecordRelOperator(const LeftRecord& left, const T& right) -> bool
{
bool result = true;
forEachLeafCoord<typename LeftRecord::AccessibleRecordDim>(
[&](auto leftRC) LLAMA_LAMBDA_INLINE {
result &= Functor{}(
left(leftRC),
static_cast<std::remove_reference_t<decltype(left(leftRC))>>(right));
});
return result;
}
struct Assign
{
template<typename A, typename B>
LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto)
{
return std::forward<A>(a) = b;
}
};
struct PlusAssign
{
template<typename A, typename B>
LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto)
{
return std::forward<A>(a) += b;
}
};
struct MinusAssign
{
template<typename A, typename B>
LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto)
{
return std::forward<A>(a) -= b;
}
};
struct MultiplyAssign
{
template<typename A, typename B>
LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto)
{
return std::forward<A>(a) *= b;
}
};
struct DivideAssign
{
template<typename A, typename B>
LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto)
{
return std::forward<A>(a) /= b;
}
};
struct ModuloAssign
{
template<typename A, typename B>
LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto)
{
return std::forward<A>(a) %= b;
}
};
template<typename TWithOptionalConst, typename T>
LLAMA_FN_HOST_ACC_INLINE auto asTupleImpl(TWithOptionalConst& leaf, T) -> std::enable_if_t<
!is_VirtualRecord<std::decay_t<TWithOptionalConst>>,
std::reference_wrapper<TWithOptionalConst>>
{
return leaf;
}
template<typename VirtualRecord, typename T, std::size_t N, std::size_t... Is>
LLAMA_FN_HOST_ACC_INLINE auto asTupleImplArr(VirtualRecord&& vd, T(&&)[N], std::index_sequence<Is...>)
{
return std::make_tuple(asTupleImpl(vd(RecordCoord<Is>{}), T{})...);
}
template<typename VirtualRecord, typename T, std::size_t N>
LLAMA_FN_HOST_ACC_INLINE auto asTupleImpl(VirtualRecord&& vd, T(&&a)[N])
{
return asTupleImplArr(std::forward<VirtualRecord>(vd), std::move(a), std::make_index_sequence<N>{});
}
template<typename VirtualRecord, typename... Fields>
LLAMA_FN_HOST_ACC_INLINE auto asTupleImpl(VirtualRecord&& vd, Record<Fields...>)
{
return std::make_tuple(asTupleImpl(vd(GetFieldTag<Fields>{}), GetFieldType<Fields>{})...);
}
template<typename TWithOptionalConst, typename T>
LLAMA_FN_HOST_ACC_INLINE auto asFlatTupleImpl(TWithOptionalConst& leaf, T)
-> std::enable_if_t<!is_VirtualRecord<std::decay_t<TWithOptionalConst>>, std::tuple<TWithOptionalConst&>>
{
return {leaf};
}
template<typename VirtualRecord, typename T, std::size_t N, std::size_t... Is>
LLAMA_FN_HOST_ACC_INLINE auto asFlatTupleImplArr(VirtualRecord&& vd, T(&&)[N], std::index_sequence<Is...>)
{
return std::tuple_cat(asFlatTupleImpl(vd(RecordCoord<Is>{}), T{})...);
}
template<typename VirtualRecord, typename T, std::size_t N>
LLAMA_FN_HOST_ACC_INLINE auto asFlatTupleImpl(VirtualRecord&& vd, T(&&a)[N])
{
return asFlatTupleImplArr(std::forward<VirtualRecord>(vd), std::move(a), std::make_index_sequence<N>{});
}
template<typename VirtualRecord, typename... Fields>
LLAMA_FN_HOST_ACC_INLINE auto asFlatTupleImpl(VirtualRecord&& vd, Record<Fields...>)
{
return std::tuple_cat(asFlatTupleImpl(vd(GetFieldTag<Fields>{}), GetFieldType<Fields>{})...);
}
template<typename T, typename = void>
constexpr inline auto isTupleLike = false;
// get<I>(t) and std::tuple_size<T> must be available
using std::get; // make sure a get<0>() can be found, so the compiler can compile the trait
template<typename T>
constexpr inline auto
isTupleLike<T, std::void_t<decltype(get<0>(std::declval<T>())), std::tuple_size<T>>> = true;
template<typename... Ts>
constexpr inline auto dependentFalse = false;
template<typename Tuple1, typename Tuple2, std::size_t... Is>
LLAMA_FN_HOST_ACC_INLINE void assignTuples(Tuple1&& dst, Tuple2&& src, std::index_sequence<Is...>);
template<typename T1, typename T2>
LLAMA_FN_HOST_ACC_INLINE void assignTupleElement(T1&& dst, T2&& src)
{
if constexpr(isTupleLike<std::decay_t<T1>> && isTupleLike<std::decay_t<T2>>)
{
static_assert(std::tuple_size_v<std::decay_t<T1>> == std::tuple_size_v<std::decay_t<T2>>);
assignTuples(dst, src, std::make_index_sequence<std::tuple_size_v<std::decay_t<T1>>>{});
}
else if constexpr(!isTupleLike<std::decay_t<T1>> && !isTupleLike<std::decay_t<T2>>)
std::forward<T1>(dst) = std::forward<T2>(src);
else
static_assert(
dependentFalse<T1, T2>,
"Elements to assign are not tuple/tuple or non-tuple/non-tuple.");
}
template<typename Tuple1, typename Tuple2, std::size_t... Is>
LLAMA_FN_HOST_ACC_INLINE void assignTuples(Tuple1&& dst, Tuple2&& src, std::index_sequence<Is...>)
{
static_assert(std::tuple_size_v<std::decay_t<Tuple1>> == std::tuple_size_v<std::decay_t<Tuple2>>);
using std::get;
(assignTupleElement(get<Is>(std::forward<Tuple1>(dst)), get<Is>(std::forward<Tuple2>(src))), ...);
}
template<typename T, typename Tuple, std::size_t... Is>
LLAMA_FN_HOST_ACC_INLINE auto makeFromTuple(Tuple&& src, std::index_sequence<Is...>)
{
using std::get;
return T{get<Is>(std::forward<Tuple>(src))...};
}
template<typename T, typename SFINAE, typename... Args>
constexpr inline auto isDirectListInitializableImpl = false;
template<typename T, typename... Args>
constexpr inline auto
isDirectListInitializableImpl<T, std::void_t<decltype(T{std::declval<Args>()...})>, Args...> = true;
template<typename T, typename... Args>
constexpr inline auto isDirectListInitializable = isDirectListInitializableImpl<T, void, Args...>;
template<typename T, typename Tuple>
constexpr inline auto isDirectListInitializableFromTuple = false;
template<typename T, template<typename...> typename Tuple, typename... Args>
constexpr inline auto
isDirectListInitializableFromTuple<T, Tuple<Args...>> = isDirectListInitializable<T, Args...>;
} // namespace internal
/// Virtual record type returned by \ref View after resolving an array dimensions coordinate or partially resolving
/// a \ref RecordCoord. A virtual record does not hold data itself (thus named "virtual"); it just binds enough
/// information (array dimensions coord and partial record coord) to retrieve it from a \ref View later. Virtual
/// records should not be created by the user. They are returned from various access functions in \ref View and
/// VirtualRecord itself.
template<typename TView, typename TBoundRecordCoord, bool OwnView>
struct VirtualRecord : private TView::Mapping::ArrayIndex
{
using View = TView; ///< View this virtual record points into.
using BoundRecordCoord
= TBoundRecordCoord; ///< Record coords into View::RecordDim which are already bound by this VirtualRecord.
private:
using ArrayIndex = typename View::Mapping::ArrayIndex;
using RecordDim = typename View::Mapping::RecordDim;
std::conditional_t<OwnView, View, View&> view;
public:
/// Subtree of the record dimension of View starting at BoundRecordCoord. If BoundRecordCoord is
/// `RecordCoord<>` (default) AccessibleRecordDim is the same as `Mapping::RecordDim`.
using AccessibleRecordDim = GetType<RecordDim, BoundRecordCoord>;
/// Creates an empty VirtualRecord. Only available if the view is owned. Used by llama::One.
LLAMA_FN_HOST_ACC_INLINE VirtualRecord()
/* requires(OwnView) */
: ArrayIndex{}
, view{allocViewStack<0, RecordDim>()}
{
static_assert(OwnView, "The default constructor of VirtualRecord is only available if it owns the view.");
}
LLAMA_FN_HOST_ACC_INLINE
VirtualRecord(ArrayIndex ai, std::conditional_t<OwnView, View&&, View&> view)
: ArrayIndex{ai}
, view{static_cast<decltype(view)>(view)}
{
}
VirtualRecord(const VirtualRecord&) = default;
// NOLINTNEXTLINE(cert-oop54-cpp)
LLAMA_FN_HOST_ACC_INLINE auto operator=(const VirtualRecord& other) -> VirtualRecord&
{
// NOLINTNEXTLINE(cppcoreguidelines-c-copy-assignment-signature,misc-unconventional-assign-operator)
return this->operator=<VirtualRecord>(other);
}
VirtualRecord(VirtualRecord&&) noexcept = default;
auto operator=(VirtualRecord&&) noexcept -> VirtualRecord& = default;
~VirtualRecord() = default;
LLAMA_FN_HOST_ACC_INLINE constexpr auto arrayIndex() const -> ArrayIndex
{
return *this;
}
/// Creates a VirtualRecord from a different VirtualRecord. Only available if the view is owned. Used by
/// llama::One.
template<typename OtherView, typename OtherBoundRecordCoord, bool OtherOwnView>
// NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions)
LLAMA_FN_HOST_ACC_INLINE VirtualRecord(
const VirtualRecord<OtherView, OtherBoundRecordCoord, OtherOwnView>& virtualRecord)
/* requires(OwnView) */
: VirtualRecord()
{
static_assert(
OwnView,
"The copy constructor of VirtualRecord from a different VirtualRecord is only available if it owns "
"the "
"view.");
*this = virtualRecord;
}
// TODO(bgruber): unify with previous in C++20 and use explicit(cond)
/// Creates a VirtualRecord from a scalar. Only available if the view is owned. Used by llama::One.
template<typename T, typename = std::enable_if_t<!is_VirtualRecord<T>>>
LLAMA_FN_HOST_ACC_INLINE explicit VirtualRecord(const T& scalar)
/* requires(OwnView) */
: VirtualRecord()
{
static_assert(
OwnView,
"The constructor of VirtualRecord from a scalar is only available if it owns the view.");
*this = scalar;
}
/// Access a record in the record dimension underneath the current virtual record using a \ref RecordCoord. If
/// the access resolves to a leaf, a reference to a variable inside the \ref View storage is returned,
/// otherwise another virtual record.
template<std::size_t... Coord>
LLAMA_FN_HOST_ACC_INLINE auto operator()(RecordCoord<Coord...> = {}) const -> decltype(auto)
{
using AbsolutCoord = Cat<BoundRecordCoord, RecordCoord<Coord...>>;
using AccessedType = GetType<RecordDim, AbsolutCoord>;
if constexpr(isRecord<AccessedType> || internal::IsBoundedArray<AccessedType>::value)
{
LLAMA_FORCE_INLINE_RECURSIVE
return VirtualRecord<const View, AbsolutCoord>{arrayIndex(), this->view};
}
else
{
LLAMA_FORCE_INLINE_RECURSIVE
return this->view.accessor(arrayIndex(), AbsolutCoord{});
}
}
// FIXME(bgruber): remove redundancy
template<std::size_t... Coord>
LLAMA_FN_HOST_ACC_INLINE auto operator()(RecordCoord<Coord...> = {}) -> decltype(auto)
{
using AbsolutCoord = Cat<BoundRecordCoord, RecordCoord<Coord...>>;
using AccessedType = GetType<RecordDim, AbsolutCoord>;
if constexpr(isRecord<AccessedType> || internal::IsBoundedArray<AccessedType>::value)
{
LLAMA_FORCE_INLINE_RECURSIVE
return VirtualRecord<View, AbsolutCoord>{arrayIndex(), this->view};
}
else
{
LLAMA_FORCE_INLINE_RECURSIVE
return this->view.accessor(arrayIndex(), AbsolutCoord{});
}
}
/// Access a record in the record dimension underneath the current virtual record using a series of tags. If
/// the access resolves to a leaf, a reference to a variable inside the \ref View storage is returned,
/// otherwise another virtual record.
template<typename... Tags>
LLAMA_FN_HOST_ACC_INLINE auto operator()(Tags...) const -> decltype(auto)
{
using RecordCoord = GetCoordFromTags<AccessibleRecordDim, Tags...>;
LLAMA_FORCE_INLINE_RECURSIVE
return operator()(RecordCoord{});
}
// FIXME(bgruber): remove redundancy
template<typename... Tags>
LLAMA_FN_HOST_ACC_INLINE auto operator()(Tags...) -> decltype(auto)
{
using RecordCoord = GetCoordFromTags<AccessibleRecordDim, Tags...>;
LLAMA_FORCE_INLINE_RECURSIVE
return operator()(RecordCoord{});
}
template<typename T>
LLAMA_FN_HOST_ACC_INLINE auto operator=(const T& other) -> VirtualRecord&
{
// NOLINTNEXTLINE(cppcoreguidelines-c-copy-assignment-signature,misc-unconventional-assign-operator)
return internal::virtualRecordArithOperator<internal::Assign>(*this, other);
}
template<typename T>
LLAMA_FN_HOST_ACC_INLINE auto operator+=(const T& other) -> VirtualRecord&
{
return internal::virtualRecordArithOperator<internal::PlusAssign>(*this, other);
}
template<typename T>
LLAMA_FN_HOST_ACC_INLINE auto operator-=(const T& other) -> VirtualRecord&
{
return internal::virtualRecordArithOperator<internal::MinusAssign>(*this, other);
}
template<typename T>
LLAMA_FN_HOST_ACC_INLINE auto operator*=(const T& other) -> VirtualRecord&
{
return internal::virtualRecordArithOperator<internal::MultiplyAssign>(*this, other);
}
template<typename T>
LLAMA_FN_HOST_ACC_INLINE auto operator/=(const T& other) -> VirtualRecord&
{
return internal::virtualRecordArithOperator<internal::DivideAssign>(*this, other);
}
template<typename T>
LLAMA_FN_HOST_ACC_INLINE auto operator%=(const T& other) -> VirtualRecord&
{
return internal::virtualRecordArithOperator<internal::ModuloAssign>(*this, other);
}
template<typename T>
LLAMA_FN_HOST_ACC_INLINE friend auto operator+(const VirtualRecord& vd, const T& t)
{
return copyVirtualRecordStack(vd) += t;
}
template<typename T, typename = std::enable_if_t<!is_VirtualRecord<T>>>
LLAMA_FN_HOST_ACC_INLINE friend auto operator+(const T& t, const VirtualRecord& vd)
{
return vd + t;
}
template<typename T>
LLAMA_FN_HOST_ACC_INLINE friend auto operator-(const VirtualRecord& vd, const T& t)
{
return copyVirtualRecordStack(vd) -= t;
}
template<typename T>
LLAMA_FN_HOST_ACC_INLINE friend auto operator*(const VirtualRecord& vd, const T& t)
{
return copyVirtualRecordStack(vd) *= t;
}
template<typename T, typename = std::enable_if_t<!is_VirtualRecord<T>>>
LLAMA_FN_HOST_ACC_INLINE friend auto operator*(const T& t, const VirtualRecord& vd)
{
return vd * t;
}
template<typename T>
LLAMA_FN_HOST_ACC_INLINE friend auto operator/(const VirtualRecord& vd, const T& t)
{
return copyVirtualRecordStack(vd) /= t;
}
template<typename T>
LLAMA_FN_HOST_ACC_INLINE friend auto operator%(const VirtualRecord& vd, const T& t)
{
return copyVirtualRecordStack(vd) %= t;
}
template<typename T>
LLAMA_FN_HOST_ACC_INLINE friend auto operator==(const VirtualRecord& vd, const T& t) -> bool
{
return internal::virtualRecordRelOperator<std::equal_to<>>(vd, t);
}
template<typename T, typename = std::enable_if_t<!is_VirtualRecord<T>>>
LLAMA_FN_HOST_ACC_INLINE friend auto operator==(const T& t, const VirtualRecord& vd) -> bool
{
return vd == t;
}
template<typename T>
LLAMA_FN_HOST_ACC_INLINE friend auto operator!=(const VirtualRecord& vd, const T& t) -> bool
{
return internal::virtualRecordRelOperator<std::not_equal_to<>>(vd, t);
}
template<typename T, typename = std::enable_if_t<!is_VirtualRecord<T>>>
LLAMA_FN_HOST_ACC_INLINE friend auto operator!=(const T& t, const VirtualRecord& vd) -> bool
{
return vd != t;
}
template<typename T>
LLAMA_FN_HOST_ACC_INLINE friend auto operator<(const VirtualRecord& vd, const T& t) -> bool
{
return internal::virtualRecordRelOperator<std::less<>>(vd, t);
}
template<typename T, typename = std::enable_if_t<!is_VirtualRecord<T>>>
LLAMA_FN_HOST_ACC_INLINE friend auto operator<(const T& t, const VirtualRecord& vd) -> bool
{
return vd > t;
}
template<typename T>
LLAMA_FN_HOST_ACC_INLINE friend auto operator<=(const VirtualRecord& vd, const T& t) -> bool
{
return internal::virtualRecordRelOperator<std::less_equal<>>(vd, t);
}
template<typename T, typename = std::enable_if_t<!is_VirtualRecord<T>>>
LLAMA_FN_HOST_ACC_INLINE friend auto operator<=(const T& t, const VirtualRecord& vd) -> bool
{
return vd >= t;
}
template<typename T>
LLAMA_FN_HOST_ACC_INLINE friend auto operator>(const VirtualRecord& vd, const T& t) -> bool
{
return internal::virtualRecordRelOperator<std::greater<>>(vd, t);
}
template<typename T, typename = std::enable_if_t<!is_VirtualRecord<T>>>
LLAMA_FN_HOST_ACC_INLINE friend auto operator>(const T& t, const VirtualRecord& vd) -> bool
{
return vd < t;
}
template<typename T>
LLAMA_FN_HOST_ACC_INLINE friend auto operator>=(const VirtualRecord& vd, const T& t) -> bool
{
return internal::virtualRecordRelOperator<std::greater_equal<>>(vd, t);
}
template<typename T, typename = std::enable_if_t<!is_VirtualRecord<T>>>
LLAMA_FN_HOST_ACC_INLINE friend auto operator>=(const T& t, const VirtualRecord& vd) -> bool
{
return vd <= t;
}
LLAMA_FN_HOST_ACC_INLINE auto asTuple()
{
return internal::asTupleImpl(*this, AccessibleRecordDim{});
}
LLAMA_FN_HOST_ACC_INLINE auto asTuple() const
{
return internal::asTupleImpl(*this, AccessibleRecordDim{});
}
LLAMA_FN_HOST_ACC_INLINE auto asFlatTuple()
{
return internal::asFlatTupleImpl(*this, AccessibleRecordDim{});
}
LLAMA_FN_HOST_ACC_INLINE auto asFlatTuple() const
{
return internal::asFlatTupleImpl(*this, AccessibleRecordDim{});
}
template<std::size_t I>
LLAMA_FN_HOST_ACC_INLINE auto get() -> decltype(auto)
{
return operator()(RecordCoord<I>{});
}
template<std::size_t I>
LLAMA_FN_HOST_ACC_INLINE auto get() const -> decltype(auto)
{
return operator()(RecordCoord<I>{});
}
template<typename TupleLike>
LLAMA_FN_HOST_ACC_INLINE auto loadAs() -> TupleLike
{
static_assert(
internal::isDirectListInitializableFromTuple<TupleLike, decltype(asFlatTuple())>,
"TupleLike must be constructible from as many values as this VirtualRecord recursively represents "
"like "
"this: TupleLike{values...}");
return internal::makeFromTuple<TupleLike>(
asFlatTuple(),
std::make_index_sequence<std::tuple_size_v<decltype(asFlatTuple())>>{});
}
template<typename TupleLike>
LLAMA_FN_HOST_ACC_INLINE auto loadAs() const -> TupleLike
{
static_assert(
internal::isDirectListInitializableFromTuple<TupleLike, decltype(asFlatTuple())>,
"TupleLike must be constructible from as many values as this VirtualRecord recursively represents "
"like "
"this: TupleLike{values...}");
return internal::makeFromTuple<TupleLike>(
asFlatTuple(),
std::make_index_sequence<std::tuple_size_v<decltype(asFlatTuple())>>{});
}
struct Loader
{
VirtualRecord& vd;
template<typename T>
// NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions)
LLAMA_FN_HOST_ACC_INLINE operator T()
{
return vd.loadAs<T>();
}
};
struct LoaderConst
{
const VirtualRecord& vd;
template<typename T>
// NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions)
LLAMA_FN_HOST_ACC_INLINE operator T() const
{
return vd.loadAs<T>();
}
};
LLAMA_FN_HOST_ACC_INLINE auto load() -> Loader
{
return {*this};
}
LLAMA_FN_HOST_ACC_INLINE auto load() const -> LoaderConst
{
return {*this};
}
template<typename TupleLike>
LLAMA_FN_HOST_ACC_INLINE void store(const TupleLike& t)
{
internal::assignTuples(asTuple(), t, std::make_index_sequence<std::tuple_size_v<TupleLike>>{});
}
// swap for equal VirtualRecord
LLAMA_FN_HOST_ACC_INLINE friend void swap(
std::conditional_t<OwnView, VirtualRecord&, VirtualRecord> a,
std::conditional_t<OwnView, VirtualRecord&, VirtualRecord> b) noexcept
{
forEachLeafCoord<AccessibleRecordDim>(
[&](auto rc) LLAMA_LAMBDA_INLINE
{
using std::swap;
swap(a(rc), b(rc));
});
}
};
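// Usage sketch (illustrative comment, reusing the hypothetical Particle/X/Y names from the sketches above):
//
//     auto vr = view(7);                       // VirtualRecord for array index {7}
//     vr(X{}) = 1.0f;                          // leaf access by tag returns a reference into the view's storage
//     auto [x, y] = vr;                        // structured bindings via the std::tuple_* specializations below
//     std::tuple<float, float> t = vr.load();  // or: vr.loadAs<std::tuple<float, float>>()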
// swap for heterogeneous VirtualRecord
template<
typename ViewA,
typename BoundRecordDimA,
bool OwnViewA,
typename ViewB,
typename BoundRecordDimB,
bool OwnViewB>
LLAMA_FN_HOST_ACC_INLINE auto swap(
VirtualRecord<ViewA, BoundRecordDimA, OwnViewA>& a,
VirtualRecord<ViewB, BoundRecordDimB, OwnViewB>& b) noexcept
-> std::enable_if_t<std::is_same_v<
typename VirtualRecord<ViewA, BoundRecordDimA, OwnViewA>::AccessibleRecordDim,
typename VirtualRecord<ViewB, BoundRecordDimB, OwnViewB>::AccessibleRecordDim>>
{
using LeftRecord = VirtualRecord<ViewA, BoundRecordDimA, OwnViewA>;
forEachLeafCoord<typename LeftRecord::AccessibleRecordDim>(
[&](auto rc) LLAMA_LAMBDA_INLINE
{
using std::swap;
swap(a(rc), b(rc));
});
}
template<typename View, typename BoundRecordCoord, bool OwnView>
auto operator<<(std::ostream& os, const VirtualRecord<View, BoundRecordCoord, OwnView>& vr) -> std::ostream&
{
using RecordDim = typename VirtualRecord<View, BoundRecordCoord, OwnView>::AccessibleRecordDim;
os << "{";
// TODO(bgruber): I tried refactoring both branches into one, but MSVC and icpc have trouble correctly discarding
// the untaken if constexpr branch and not instantiating templates inside it.
if constexpr(std::is_array_v<RecordDim>)
{
constexpr auto size = std::extent_v<RecordDim>;
boost::mp11::mp_for_each<boost::mp11::mp_iota_c<size>>(
[&](auto ic)
{
constexpr std::size_t i = decltype(ic)::value;
os << '[' << i << ']' << ": " << vr(RecordCoord<i>{});
if(i + 1 < size)
os << ", ";
});
}
else
{
constexpr auto size = boost::mp11::mp_size<RecordDim>::value;
boost::mp11::mp_for_each<boost::mp11::mp_iota_c<size>>(
[&](auto ic)
{
constexpr std::size_t i = decltype(ic)::value;
using Field = boost::mp11::mp_at_c<RecordDim, i>;
using Tag = GetFieldTag<Field>;
os << structName<Tag>() << ": " << vr(RecordCoord<i>{});
if(i + 1 < size)
os << ", ";
});
}
os << "}";
return os;
}
template<typename VirtualRecordFwd, typename Functor>
LLAMA_FN_HOST_ACC_INLINE constexpr void forEachLeaf(VirtualRecordFwd&& vr, Functor&& functor)
{
using VirtualRecord = std::remove_reference_t<VirtualRecordFwd>;
LLAMA_FORCE_INLINE_RECURSIVE
forEachLeafCoord<typename VirtualRecord::AccessibleRecordDim>(
[functor = std::forward<Functor>(functor), &vr = vr](auto rc)
LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS(constexpr mutable) { std::forward<Functor>(functor)(vr(rc)); });
}
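// Usage sketch (not part of the library; "view" is assumed to be a llama::View): forEachLeaf visits every leaf
// value of a VirtualRecord, e.g. to zero all fields of one element:
//     llama::forEachLeaf(view(1u), [](auto&& leaf) { leaf = 0; });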
} // namespace llama
template<typename View, typename BoundRecordCoord, bool OwnView>
struct std::tuple_size<llama::VirtualRecord<View, BoundRecordCoord, OwnView>>
: boost::mp11::mp_size<typename llama::VirtualRecord<View, BoundRecordCoord, OwnView>::AccessibleRecordDim>
{
};
template<std::size_t I, typename View, typename BoundRecordCoord, bool OwnView>
struct std::tuple_element<I, llama::VirtualRecord<View, BoundRecordCoord, OwnView>>
{
using type = decltype(std::declval<llama::VirtualRecord<View, BoundRecordCoord, OwnView>>().template get<I>());
};
template<std::size_t I, typename View, typename BoundRecordCoord, bool OwnView>
struct std::tuple_element<I, const llama::VirtualRecord<View, BoundRecordCoord, OwnView>>
{
using type
= decltype(std::declval<const llama::VirtualRecord<View, BoundRecordCoord, OwnView>>().template get<I>());
};
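// With the std::tuple_size/std::tuple_element specializations above and VirtualRecord::get(), structured bindings
// work on a VirtualRecord. Sketch, assuming the accessible record dimension has exactly two fields:
//     auto&& [a, b] = view(1u);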
#if CAN_USE_RANGES
template<
typename ViewA,
typename BoundA,
bool OwnA,
typename ViewB,
typename BoundB,
bool OwnB,
template<class>
class TQual,
template<class>
class UQual>
struct std::basic_common_reference<
llama::VirtualRecord<ViewA, BoundA, OwnA>,
llama::VirtualRecord<ViewB, BoundB, OwnB>,
TQual,
UQual>
{
using type = std::enable_if_t<
std::is_same_v<
typename llama::VirtualRecord<ViewA, BoundA, OwnA>::AccessibleRecordDim,
typename llama::VirtualRecord<ViewB, BoundB, OwnB>::AccessibleRecordDim>,
llama::One<typename ViewA::RecordDim>>;
};
#endif
// ==
// == ./VirtualRecord.hpp ==
// ============================================================================
// #include <algorithm> // amalgamate: file already included
#include <stdexcept>
// #include <string> // amalgamate: file already included
namespace llama
{
// TODO(bgruber): expose blob allocator
/// An equivalent of std::vector<T> backed by a \ref View. In contrast to std::vector, elements are never
/// value-initialized and there is no strong exception guarantee.
/// WARNING: This class is experimental.
/// @tparam Mapping The mapping to be used for the underlying view. Needs to have 1 array dimension.
template<typename Mapping>
struct Vector
{
static_assert(Mapping::ArrayExtents::rank == 1, "llama::Vector only supports 1D mappings");
using ViewType = decltype(allocViewUninitialized<Mapping>());
using RecordDim = typename Mapping::RecordDim;
using iterator = decltype(std::declval<ViewType>().begin());
using value_type = typename iterator::value_type;
Vector() = default;
template<typename VirtualRecord = One<RecordDim>>
LLAMA_FN_HOST_ACC_INLINE explicit Vector(std::size_t count, const VirtualRecord& value = {})
{
reserve(count);
for(std::size_t i = 0; i < count; i++)
push_back(value);
}
template<typename Iterator>
LLAMA_FN_HOST_ACC_INLINE Vector(Iterator first, Iterator last)
{
if constexpr(std::is_same_v<
typename std::iterator_traits<Iterator>::iterator_category,
std::random_access_iterator_tag>)
reserve(std::distance(first, last));
for(; first != last; ++first)
push_back(*first);
}
Vector(const Vector& other) = default;
LLAMA_FN_HOST_ACC_INLINE Vector(Vector&& other) noexcept
{
swap(other);
}
auto operator=(const Vector& other) -> Vector& = default;
LLAMA_FN_HOST_ACC_INLINE auto operator=(Vector&& other) noexcept -> Vector&
{
swap(other);
return *this;
}
~Vector() = default;
// TODO(bgruber): assign
LLAMA_FN_HOST_ACC_INLINE auto at(std::size_t i) -> decltype(auto)
{
if(i >= m_size)
throw std::out_of_range{
"Index " + std::to_string(i) + " out of range [0:" + std::to_string(m_size) + "["};
return m_view(i);
}
LLAMA_FN_HOST_ACC_INLINE auto at(std::size_t i) const -> decltype(auto)
{
if(i >= m_size)
throw std::out_of_range{
"Index " + std::to_string(i) + " out of range [0:" + std::to_string(m_size) + "["};
return m_view(i);
}
LLAMA_FN_HOST_ACC_INLINE auto operator[](std::size_t i) -> decltype(auto)
{
return m_view(i);
}
LLAMA_FN_HOST_ACC_INLINE auto operator[](std::size_t i) const -> decltype(auto)
{
return m_view(i);
}
LLAMA_FN_HOST_ACC_INLINE auto front() -> decltype(auto)
{
return m_view(0);
}
LLAMA_FN_HOST_ACC_INLINE auto front() const -> decltype(auto)
{
return m_view(0);
}
LLAMA_FN_HOST_ACC_INLINE auto back() -> decltype(auto)
{
return m_view(m_size - 1);
}
LLAMA_FN_HOST_ACC_INLINE auto back() const -> decltype(auto)
{
return m_view(m_size - 1);
}
LLAMA_FN_HOST_ACC_INLINE auto begin() -> decltype(auto)
{
return m_view.begin();
}
LLAMA_FN_HOST_ACC_INLINE auto begin() const -> decltype(auto)
{
return m_view.begin();
}
LLAMA_FN_HOST_ACC_INLINE auto cbegin() -> decltype(auto)
{
return std::as_const(m_view).begin();
}
LLAMA_FN_HOST_ACC_INLINE auto cbegin() const -> decltype(auto)
{
return m_view.begin();
}
LLAMA_FN_HOST_ACC_INLINE auto end() -> decltype(auto)
{
return m_view.begin() + m_size;
}
LLAMA_FN_HOST_ACC_INLINE auto end() const -> decltype(auto)
{
return m_view.begin() + m_size;
}
LLAMA_FN_HOST_ACC_INLINE auto cend() -> decltype(auto)
{
return std::as_const(m_view).begin() + m_size;
}
LLAMA_FN_HOST_ACC_INLINE auto cend() const -> decltype(auto)
{
return m_view.begin() + m_size;
}
LLAMA_FN_HOST_ACC_INLINE auto empty() const -> bool
{
return m_size == 0;
}
LLAMA_FN_HOST_ACC_INLINE auto size() const -> std::size_t
{
return m_size;
}
LLAMA_FN_HOST_ACC_INLINE void reserve(std::size_t cap)
{
if(cap > capacity())
changeCapacity(cap);
}
LLAMA_FN_HOST_ACC_INLINE auto capacity() const -> std::size_t
{
return m_view.mapping().extents()[0];
}
LLAMA_FN_HOST_ACC_INLINE void shrink_to_fit()
{
changeCapacity(m_size);
}
LLAMA_FN_HOST_ACC_INLINE void clear()
{
m_size = 0;
}
template<typename T>
LLAMA_FN_HOST_ACC_INLINE auto insert(iterator pos, T&& t) -> iterator
{
const auto i = pos - begin();
reserve(m_size + 1); // might invalidate pos
pos = begin() + i;
std::copy_backward(pos, end(), end() + 1);
m_view[i] = std::forward<T>(t);
m_size++;
return pos;
}
// TODO(bgruber): more insert overloads
// TODO(bgruber): emplace
LLAMA_FN_HOST_ACC_INLINE auto erase(iterator pos) -> iterator
{
std::copy(pos + 1, end(), pos);
m_size--;
return pos;
}
// TODO(bgruber): more erase overloads
// TODO(bgruber): T here is probably a virtual record. We could also allow any struct that is storable to the
// view via VirtualRecord::store().
template<typename T>
LLAMA_FN_HOST_ACC_INLINE void push_back(T&& t)
{
if(const auto cap = capacity(); m_size == cap)
reserve(std::max(cap + cap / 2, m_size + 1));
m_view[m_size++] = std::forward<T>(t);
}
// TODO(bgruber): emplace_back
LLAMA_FN_HOST_ACC_INLINE void pop_back()
{
m_size--;
}
template<typename VirtualRecord = One<RecordDim>>
LLAMA_FN_HOST_ACC_INLINE void resize(std::size_t count, const VirtualRecord& value = {})
{
reserve(count);
for(std::size_t i = m_size; i < count; i++)
m_view[i] = value;
m_size = count;
}
LLAMA_FN_HOST_ACC_INLINE friend auto operator==(const Vector& a, const Vector& b) -> bool
{
if(a.m_size != b.m_size)
return false;
return std::equal(a.begin(), a.end(), b.begin());
}
LLAMA_FN_HOST_ACC_INLINE friend auto operator!=(const Vector& a, const Vector& b) -> bool
{
return !(a == b);
}
LLAMA_FN_HOST_ACC_INLINE friend auto operator<(const Vector& a, const Vector& b) -> bool
{
return std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end());
}
LLAMA_FN_HOST_ACC_INLINE friend auto operator<=(const Vector& a, const Vector& b) -> bool
{
return !(b < a);
}
LLAMA_FN_HOST_ACC_INLINE friend auto operator>(const Vector& a, const Vector& b) -> bool
{
return b < a;
}
LLAMA_FN_HOST_ACC_INLINE friend auto operator>=(const Vector& a, const Vector& b) -> bool
{
return !(a < b);
}
LLAMA_FN_HOST_ACC_INLINE friend void swap(Vector& a, Vector& b) noexcept
{
a.swap(b);
}
private:
LLAMA_FN_HOST_ACC_INLINE void changeCapacity(std::size_t cap)
{
auto newView = allocViewUninitialized<Mapping>(Mapping{typename Mapping::ArrayExtents{cap}});
auto b = begin();
std::copy(begin(), b + std::min(m_size, cap), newView.begin());
using std::swap;
swap(m_view, newView); // depends on move semantic of View
}
LLAMA_FN_HOST_ACC_INLINE void swap(Vector& other) noexcept
{
using std::swap;
swap(m_view, other.m_view); // depends on move semantic of View
swap(m_size, other.m_size);
}
ViewType m_view = {};
std::size_t m_size = 0;
};
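// Usage sketch (not part of the library; the extents type, the record dimension Vec and its tag X are assumptions):
//     using Mapping = llama::mapping::AoS<llama::ArrayExtentsDynamic<1>, Vec>; // any 1D mapping works
//     llama::Vector<Mapping> v(10);    // 10 default-constructed elements
//     v.push_back(llama::One<Vec>{});  // grows like std::vector
//     v[3](X{}) = 1.0f;                // element access yields a VirtualRecord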
} // namespace llama
// ==
// == ./Vector.hpp ==
// ============================================================================
// ============================================================================
// == ./Copy.hpp ==
// ==
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
// #include "View.hpp" // amalgamate: file already expanded
// ============================================================================
// == ./mapping/AoSoA.hpp ==
// ==
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
// #include "Common.hpp" // amalgamate: file already expanded
// #include <limits> // amalgamate: file already included
namespace llama::mapping
{
/// The maximum number of vector lanes that can be used to fetch each leaf type in the record dimension into a
/// vector register of the given size in bits.
template<typename RecordDim, std::size_t VectorRegisterBits>
inline constexpr std::size_t maxLanes = []() constexpr
{
auto max = std::numeric_limits<std::size_t>::max();
forEachLeafCoord<RecordDim>(
[&](auto rc)
{
using AttributeType = GetType<RecordDim, decltype(rc)>;
max = std::min(max, VectorRegisterBits / (sizeof(AttributeType) * CHAR_BIT));
});
return max;
}
();
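// Example: for a record dimension whose leaves are all float (32 bit), maxLanes<RecordDim, 256> is 8; adding a
// double (64 bit) leaf to the same record dimension lowers it to 4, since the minimum over all leaves is taken.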
/// Array of struct of arrays mapping. Used to create a \ref View via \ref allocView.
/// \tparam Lanes The size of the inner arrays of this array of struct of arrays.
/// \tparam FlattenRecordDim Defines how the record dimension's fields should be flattened. See \ref
/// FlattenRecordDimInOrder, \ref FlattenRecordDimIncreasingAlignment, \ref FlattenRecordDimDecreasingAlignment and
/// \ref FlattenRecordDimMinimizePadding.
template<
typename TArrayExtents,
typename TRecordDim,
std::size_t Lanes,
typename TLinearizeArrayDimsFunctor = LinearizeArrayDimsCpp,
template<typename> typename FlattenRecordDim = FlattenRecordDimInOrder>
struct AoSoA : private TArrayExtents
{
using ArrayExtents = TArrayExtents;
using ArrayIndex = typename ArrayExtents::Index;
using RecordDim = TRecordDim;
using LinearizeArrayDimsFunctor = TLinearizeArrayDimsFunctor;
static constexpr std::size_t blobCount = 1;
constexpr AoSoA() = default;
LLAMA_FN_HOST_ACC_INLINE constexpr explicit AoSoA(ArrayExtents extents, RecordDim = {}) : ArrayExtents(extents)
{
}
LLAMA_FN_HOST_ACC_INLINE constexpr auto extents() const -> ArrayExtents
{
return ArrayExtents{*this};
}
LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(std::size_t) const -> std::size_t
{
return roundUpToMultiple(
LinearizeArrayDimsFunctor{}.size(extents()) * sizeOf<RecordDim>,
Lanes * sizeOf<RecordDim>);
}
template<std::size_t... RecordCoords>
LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset(ArrayIndex ai, RecordCoord<RecordCoords...> = {}) const
-> NrAndOffset
{
constexpr std::size_t flatFieldIndex =
#ifdef __NVCC__
*& // mess with nvcc compiler state to workaround bug
#endif
Flattener::template flatIndex<RecordCoords...>;
const auto flatArrayIndex = LinearizeArrayDimsFunctor{}(ai, extents());
const auto blockIndex = flatArrayIndex / Lanes;
const auto laneIndex = flatArrayIndex % Lanes;
const auto offset = (sizeOf<RecordDim> * Lanes) * blockIndex
+ flatOffsetOf<typename Flattener::FlatRecordDim, flatFieldIndex, false> * Lanes
+ sizeof(GetType<RecordDim, RecordCoord<RecordCoords...>>) * laneIndex;
return {0, offset};
}
private:
using Flattener = FlattenRecordDim<TRecordDim>;
};
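// Usage sketch (not part of the library; the extents type and the record dimension Vec are assumptions):
//     using Mapping = llama::mapping::AoSoA<llama::ArrayExtentsDynamic<1>, Vec, 8>; // blocks of 8 lanes
//     auto view = llama::allocView(Mapping{llama::ArrayExtentsDynamic<1>{1024}});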
template<std::size_t Lanes, typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp>
struct PreconfiguredAoSoA
{
template<typename ArrayExtents, typename RecordDim>
using type = AoSoA<ArrayExtents, RecordDim, Lanes, LinearizeArrayDimsFunctor>;
};
template<typename Mapping>
inline constexpr bool isAoSoA = false;
template<typename AD, typename RD, std::size_t L>
inline constexpr bool isAoSoA<AoSoA<AD, RD, L>> = true;
} // namespace llama::mapping
// ==
// == ./mapping/AoSoA.hpp ==
// ============================================================================
// ============================================================================
// == ./mapping/SoA.hpp ==
// ==
// Copyright 2018 Alexander Matthes
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
// #include "Common.hpp" // amalgamate: file already expanded
// #include <limits> // amalgamate: file already included
namespace llama::mapping
{
/// Struct of array mapping. Used to create a \ref View via \ref allocView.
/// \tparam SeparateBuffers If true, every element of the record dimension is mapped to its own buffer.
/// \tparam LinearizeArrayDimsFunctor Defines how the array dimensions should be mapped into linear numbers and
/// how big the linear domain gets.
/// \tparam FlattenRecordDim Defines how the record dimension's fields should be flattened if SeparateBuffers is
/// false. See \ref FlattenRecordDimInOrder, \ref FlattenRecordDimIncreasingAlignment, \ref
/// FlattenRecordDimDecreasingAlignment and \ref FlattenRecordDimMinimizePadding.
template<
typename TArrayExtents,
typename TRecordDim,
bool SeparateBuffers = true,
typename TLinearizeArrayDimsFunctor = LinearizeArrayDimsCpp,
template<typename> typename FlattenRecordDimSingleBlob = FlattenRecordDimInOrder>
struct SoA : private TArrayExtents
{
using ArrayExtents = TArrayExtents;
using ArrayIndex = typename ArrayExtents::Index;
using RecordDim = TRecordDim;
using LinearizeArrayDimsFunctor = TLinearizeArrayDimsFunctor;
static constexpr std::size_t blobCount
= SeparateBuffers ? boost::mp11::mp_size<FlatRecordDim<RecordDim>>::value : 1;
constexpr SoA() = default;
LLAMA_FN_HOST_ACC_INLINE
constexpr explicit SoA(ArrayExtents extents, RecordDim = {}) : ArrayExtents(extents)
{
}
LLAMA_FN_HOST_ACC_INLINE constexpr auto extents() const -> ArrayExtents
{
return ArrayExtents{*this};
}
LLAMA_FN_HOST_ACC_INLINE
constexpr auto blobSize([[maybe_unused]] std::size_t blobIndex) const -> std::size_t
{
if constexpr(SeparateBuffers)
{
constexpr Array<std::size_t, blobCount> typeSizes = []() constexpr
{
Array<std::size_t, blobCount> r{};
forEachLeafCoord<RecordDim>([&r, i = 0](auto rc) mutable constexpr
{ r[i++] = sizeof(GetType<RecordDim, decltype(rc)>); });
return r;
}
();
return LinearizeArrayDimsFunctor{}.size(extents()) * typeSizes[blobIndex];
}
else
{
return LinearizeArrayDimsFunctor{}.size(extents()) * sizeOf<RecordDim>;
}
}
template<std::size_t... RecordCoords>
LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset(ArrayIndex ad, RecordCoord<RecordCoords...> = {}) const
-> NrAndOffset
{
if constexpr(SeparateBuffers)
{
constexpr auto blob = flatRecordCoord<RecordDim, RecordCoord<RecordCoords...>>;
const auto offset = LinearizeArrayDimsFunctor{}(ad, extents())
* sizeof(GetType<RecordDim, RecordCoord<RecordCoords...>>);
return {blob, offset};
}
else
{
constexpr std::size_t flatFieldIndex =
#ifdef __NVCC__
*& // mess with nvcc compiler state to workaround bug
#endif
Flattener::template flatIndex<RecordCoords...>;
const auto offset = LinearizeArrayDimsFunctor{}(ad, extents())
* sizeof(GetType<RecordDim, RecordCoord<RecordCoords...>>)
+ flatOffsetOf<
typename Flattener::FlatRecordDim,
flatFieldIndex,
false> * LinearizeArrayDimsFunctor{}.size(extents());
return {0, offset};
}
}
private:
using Flattener = FlattenRecordDimSingleBlob<TRecordDim>;
};
/// Struct of array mapping storing the entire layout in a single blob.
/// \see SoA
template<typename ArrayExtents, typename RecordDim, typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp>
using SingleBlobSoA = SoA<ArrayExtents, RecordDim, false, LinearizeArrayDimsFunctor>;
/// Struct of array mapping storing each attribute of the record dimension in a separate blob.
/// \see SoA
template<typename ArrayExtents, typename RecordDim, typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp>
using MultiBlobSoA = SoA<ArrayExtents, RecordDim, true, LinearizeArrayDimsFunctor>;
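// Usage sketch (not part of the library; the extents type and the record dimension Vec are assumptions):
//     using Mapping = llama::mapping::MultiBlobSoA<llama::ArrayExtentsDynamic<1>, Vec>; // one blob per leaf field
//     auto view = llama::allocView(Mapping{llama::ArrayExtentsDynamic<1>{1024}});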
template<bool SeparateBuffers = true, typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp>
struct PreconfiguredSoA
{
template<typename ArrayExtents, typename RecordDim>
using type = SoA<ArrayExtents, RecordDim, SeparateBuffers, LinearizeArrayDimsFunctor>;
};
template<typename Mapping>
inline constexpr bool isSoA = false;
template<typename ArrayExtents, typename RecordDim, bool SeparateBuffers, typename LinearizeArrayDimsFunctor>
inline constexpr bool isSoA<SoA<ArrayExtents, RecordDim, SeparateBuffers, LinearizeArrayDimsFunctor>> = true;
} // namespace llama::mapping
// ==
// == ./mapping/SoA.hpp ==
// ============================================================================
#include <cstring>
#include <numeric>
namespace llama
{
namespace internal
{
template<typename RecordDim>
void assertTrivialCopyable()
{
forEachLeafCoord<RecordDim>(
[](auto rc)
{
static_assert(
std::is_trivially_copyable_v<GetType<RecordDim, decltype(rc)>>,
"All types in the record dimension must be trivially copyable");
});
}
using memcopyFunc = void* (*) (void*, const void*, std::size_t);
inline void parallel_memcpy(
std::byte* dst,
const std::byte* src,
std::size_t size,
std::size_t threadId = 0,
std::size_t threadCount = 1,
memcopyFunc singleThreadMemcpy = std::memcpy)
{
const auto sizePerThread = size / threadCount;
const auto sizeLastThread = sizePerThread + size % threadCount;
const auto sizeThisThread = threadId == threadCount - 1 ? sizeLastThread : sizePerThread;
singleThreadMemcpy(dst + threadId * sizePerThread, src + threadId * sizePerThread, sizeThisThread);
}
} // namespace internal
/// Direct memcpy from source view blobs to destination view blobs. Both views need to have the same mappings with
/// the same array dimensions.
/// @param threadId Optional. Zero-based id of calling thread for multi-threaded invocations.
/// @param threadCount Optional. Thread count in case of multi-threaded invocation.
template<typename Mapping, typename SrcBlob, typename DstBlob>
void blobMemcpy(
const View<Mapping, SrcBlob>& srcView,
View<Mapping, DstBlob>& dstView,
std::size_t threadId = 0,
std::size_t threadCount = 1)
{
internal::assertTrivialCopyable<typename Mapping::RecordDim>();
// TODO(bgruber): we do not verify if the mappings have other runtime state than the array dimensions
if(srcView.mapping().extents() != dstView.mapping().extents())
throw std::runtime_error{"Array dimensions sizes are different"};
// TODO(bgruber): this is maybe not the best parallel copying strategy
for(std::size_t i = 0; i < Mapping::blobCount; i++)
internal::parallel_memcpy(
&dstView.storageBlobs[i][0],
&srcView.storageBlobs[i][0],
dstView.mapping().blobSize(i),
threadId,
threadCount);
}
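// Usage sketch (not part of the library; srcView/dstView are assumed views over the same mapping type and extents):
//     llama::blobMemcpy(srcView, dstView);                   // single-threaded
//     llama::blobMemcpy(srcView, dstView, tid, threadCount); // call from each of threadCount threads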
/// Field-wise copy from source to destination view. Both views need to have the same array and record dimensions.
/// @param threadId Optional. Thread id in case of multi-threaded copy.
/// @param threadCount Optional. Thread count in case of multi-threaded copy.
template<typename SrcMapping, typename SrcBlob, typename DstMapping, typename DstBlob>
void fieldWiseCopy(
const View<SrcMapping, SrcBlob>& srcView,
View<DstMapping, DstBlob>& dstView,
std::size_t threadId = 0,
std::size_t threadCount = 1)
{
// TODO(bgruber): think if we can remove this restriction
static_assert(
std::is_same_v<typename SrcMapping::RecordDim, typename DstMapping::RecordDim>,
"The source and destination record dimensions must be the same");
if(srcView.mapping().extents() != dstView.mapping().extents())
throw std::runtime_error{"Array dimensions sizes are different"};
auto copyOne = [&](auto ai) LLAMA_LAMBDA_INLINE
{
forEachLeafCoord<typename DstMapping::RecordDim>([&](auto rc) LLAMA_LAMBDA_INLINE
{ dstView(ai)(rc) = srcView(ai)(rc); });
};
constexpr auto dims = SrcMapping::ArrayExtents::rank;
const auto extents = srcView.mapping().extents().toArray();
const auto workPerThread = (extents[0] + threadCount - 1) / threadCount;
const auto start = threadId * workPerThread;
const auto end = std::min((threadId + 1) * workPerThread, extents[0]);
for(auto i = start; i < end; i++)
{
if constexpr(dims > 1)
forEachADCoord(ArrayIndex<dims - 1>{pop_front(extents)}, copyOne, static_cast<std::size_t>(i));
else
copyOne(ArrayIndex<dims>{static_cast<std::size_t>(i)});
}
}
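// Usage sketch (not part of the library; srcView/dstView are assumed views sharing record dimension and extents,
// but possibly using different mappings):
//     llama::fieldWiseCopy(srcView, dstView);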
namespace internal
{
template<typename Mapping>
inline constexpr std::size_t aosoaLanes = 0;
template<typename ArrayExtents, typename RecordDim, bool SeparateBuffers, typename LinearizeArrayDimsFunctor>
inline constexpr std::size_t aosoaLanes<
mapping::SoA<ArrayExtents, RecordDim, SeparateBuffers, LinearizeArrayDimsFunctor>> = std::
numeric_limits<std::size_t>::max();
template<typename ArrayExtents, typename RecordDim, std::size_t Lanes, typename LinearizeArrayDimsFunctor>
inline constexpr std::size_t
aosoaLanes<mapping::AoSoA<ArrayExtents, RecordDim, Lanes, LinearizeArrayDimsFunctor>> = Lanes;
} // namespace internal
/// AoSoA copy strategy which transfers data in common blocks. At most one of the two mappings may be an SoA
/// mapping instead of an AoSoA mapping.
/// @param threadId Optional. Zero-based id of calling thread for multi-threaded invocations.
/// @param threadCount Optional. Thread count in case of multi-threaded invocation.
template<typename SrcMapping, typename SrcBlob, typename DstMapping, typename DstBlob>
void aosoaCommonBlockCopy(
const View<SrcMapping, SrcBlob>& srcView,
View<DstMapping, DstBlob>& dstView,
bool readOpt,
std::size_t threadId = 0,
std::size_t threadCount = 1)
{
// TODO(bgruber): think if we can remove this restriction
static_assert(
std::is_same_v<typename SrcMapping::RecordDim, typename DstMapping::RecordDim>,
"The source and destination record dimensions must be the same");
static_assert(
std::is_same_v<
typename SrcMapping::LinearizeArrayDimsFunctor,
typename DstMapping::LinearizeArrayDimsFunctor>,
"Source and destination mapping need to use the same array dimensions linearizer");
using RecordDim = typename SrcMapping::RecordDim;
internal::assertTrivialCopyable<RecordDim>();
[[maybe_unused]] static constexpr bool MBSrc = SrcMapping::blobCount > 1;
[[maybe_unused]] static constexpr bool MBDst = DstMapping::blobCount > 1;
static constexpr auto LanesSrc = internal::aosoaLanes<SrcMapping>;
static constexpr auto LanesDst = internal::aosoaLanes<DstMapping>;
if(srcView.mapping().extents() != dstView.mapping().extents())
throw std::runtime_error{"Array dimensions sizes are different"};
static constexpr auto srcIsAoSoA = LanesSrc != std::numeric_limits<std::size_t>::max();
static constexpr auto dstIsAoSoA = LanesDst != std::numeric_limits<std::size_t>::max();
static_assert(srcIsAoSoA || dstIsAoSoA, "At least one of the mappings must be an AoSoA mapping");
static_assert(
!srcIsAoSoA || std::tuple_size_v<decltype(srcView.storageBlobs)> == 1,
"Implementation assumes AoSoA with single blob");
static_assert(
!dstIsAoSoA || std::tuple_size_v<decltype(dstView.storageBlobs)> == 1,
"Implementation assumes AoSoA with single blob");
const auto flatSize = product(dstView.mapping().extents());
// TODO(bgruber): implement the following by adding additional copy loops for the remaining elements
if(!srcIsAoSoA && flatSize % LanesDst != 0)
throw std::runtime_error{"Source SoA mapping's total array elements must be evenly divisible by the "
"destination AoSoA Lane count."};
if(!dstIsAoSoA && flatSize % LanesSrc != 0)
throw std::runtime_error{"Destination SoA mapping's total array elements must be evenly divisible by the "
"source AoSoA Lane count."};
// the same as AoSoA::blobNrAndOffset but takes a flat array index
auto mapAoSoA = [](std::size_t flatArrayIndex, auto rc, std::size_t Lanes) LLAMA_LAMBDA_INLINE
{
const auto blockIndex = flatArrayIndex / Lanes;
const auto laneIndex = flatArrayIndex % Lanes;
const auto offset = (sizeOf<RecordDim> * Lanes) * blockIndex + offsetOf<RecordDim, decltype(rc)> * Lanes
+ sizeof(GetType<RecordDim, decltype(rc)>) * laneIndex;
return offset;
};
// the same as SoA::blobNrAndOffset but takes a flat array index
auto mapSoA = [&](std::size_t flatArrayIndex, auto rc, bool mb) LLAMA_LAMBDA_INLINE
{
const auto blob = mb * flatRecordCoord<RecordDim, decltype(rc)>;
const auto offset = !mb * offsetOf<RecordDim, decltype(rc)> * flatSize
+ sizeof(GetType<RecordDim, decltype(rc)>) * flatArrayIndex;
return NrAndOffset{blob, offset};
};
auto mapSrc = [&](std::size_t flatArrayIndex, auto rc) LLAMA_LAMBDA_INLINE
{
if constexpr(srcIsAoSoA)
return &srcView.storageBlobs[0][0] + mapAoSoA(flatArrayIndex, rc, LanesSrc);
else
{
const auto [blob, off] = mapSoA(flatArrayIndex, rc, MBSrc);
return &srcView.storageBlobs[blob][off];
}
};
auto mapDst = [&](std::size_t flatArrayIndex, auto rc) LLAMA_LAMBDA_INLINE
{
if constexpr(dstIsAoSoA)
return &dstView.storageBlobs[0][0] + mapAoSoA(flatArrayIndex, rc, LanesDst);
else
{
const auto [blob, off] = mapSoA(flatArrayIndex, rc, MBDst);
return &dstView.storageBlobs[blob][off];
}
};
static constexpr auto L = []
{
if constexpr(srcIsAoSoA && dstIsAoSoA)
return std::gcd(LanesSrc, LanesDst);
return std::min(LanesSrc, LanesDst);
}();
if(readOpt)
{
// optimized for linear reading
constexpr auto srcL = srcIsAoSoA ? LanesSrc : L;
const auto elementsPerThread = flatSize / srcL / threadCount * srcL;
{
const auto start = threadId * elementsPerThread;
const auto stop = threadId == threadCount - 1 ? flatSize : (threadId + 1) * elementsPerThread;
auto copyLBlock = [&](const std::byte*& threadSrc, std::size_t dstIndex, auto rc) LLAMA_LAMBDA_INLINE
{
constexpr auto bytes = L * sizeof(GetType<RecordDim, decltype(rc)>);
std::memcpy(mapDst(dstIndex, rc), threadSrc, bytes);
threadSrc += bytes;
};
if constexpr(srcIsAoSoA)
{
auto* threadSrc = mapSrc(start, RecordCoord<>{});
for(std::size_t i = start; i < stop; i += LanesSrc)
forEachLeafCoord<RecordDim>(
[&](auto rc) LLAMA_LAMBDA_INLINE
{
for(std::size_t j = 0; j < LanesSrc; j += L)
copyLBlock(threadSrc, i + j, rc);
});
}
else
{
forEachLeafCoord<RecordDim>(
[&](auto rc) LLAMA_LAMBDA_INLINE
{
auto* threadSrc = mapSrc(start, rc);
for(std::size_t i = start; i < stop; i += L)
copyLBlock(threadSrc, i, rc);
});
}
}
}
else
{
// optimized for linear writing
constexpr auto dstL = dstIsAoSoA ? LanesDst : L;
const auto elementsPerThread = flatSize / dstL / threadCount * dstL;
{
const auto start = threadId * elementsPerThread;
const auto stop = threadId == threadCount - 1 ? flatSize : (threadId + 1) * elementsPerThread;
auto copyLBlock = [&](std::byte*& threadDst, std::size_t srcIndex, auto rc) LLAMA_LAMBDA_INLINE
{
constexpr auto bytes = L * sizeof(GetType<RecordDim, decltype(rc)>);
std::memcpy(threadDst, mapSrc(srcIndex, rc), bytes);
threadDst += bytes;
};
if constexpr(dstIsAoSoA)
{
auto* threadDst = mapDst(start, RecordCoord<>{});
for(std::size_t i = start; i < stop; i += LanesDst)
forEachLeafCoord<RecordDim>(
[&](auto rc) LLAMA_LAMBDA_INLINE
{
for(std::size_t j = 0; j < LanesDst; j += L)
copyLBlock(threadDst, i + j, rc);
});
}
else
{
forEachLeafCoord<RecordDim>(
[&](auto rc) LLAMA_LAMBDA_INLINE
{
auto* threadDst = mapDst(start, rc);
for(std::size_t i = start; i < stop; i += L)
copyLBlock(threadDst, i, rc);
});
}
}
}
}
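// Usage sketch (not part of the library; srcView/dstView are assumed views as described above): copy between an
// AoSoA view and an SoA (or differently sized AoSoA) view, choosing whether reads or writes are linear:
//     llama::aosoaCommonBlockCopy(srcView, dstView, true /*readOpt*/);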
/// @brief Generic implementation of \ref copy defaulting to \ref fieldWiseCopy. LLAMA provides several
/// specializations of this construct for specific mappings. Users are encouraged to also specialize this template
/// for further combinations of mappings if they can provide a better copy implementation.
template<typename SrcMapping, typename DstMapping, typename SFINAE = void>
struct Copy
{
template<typename SrcView, typename DstView>
void operator()(const SrcView& srcView, DstView& dstView, std::size_t threadId, std::size_t threadCount) const
{
fieldWiseCopy(srcView, dstView, threadId, threadCount);
}
};
template<typename Mapping>
struct Copy<Mapping, Mapping>
{
template<typename SrcView, typename DstView>
void operator()(const SrcView& srcView, DstView& dstView, std::size_t threadId, std::size_t threadCount) const
{
blobMemcpy(srcView, dstView, threadId, threadCount);
}
};
template<
typename ArrayExtents,
typename RecordDim,
typename LinearizeArrayDims,
std::size_t LanesSrc,
std::size_t LanesDst>
struct Copy<
mapping::AoSoA<ArrayExtents, RecordDim, LanesSrc, LinearizeArrayDims>,
mapping::AoSoA<ArrayExtents, RecordDim, LanesDst, LinearizeArrayDims>,
std::enable_if_t<LanesSrc != LanesDst>>
{
template<typename SrcBlob, typename DstBlob>
void operator()(
const View<mapping::AoSoA<ArrayExtents, RecordDim, LanesSrc, LinearizeArrayDims>, SrcBlob>& srcView,
View<mapping::AoSoA<ArrayExtents, RecordDim, LanesDst, LinearizeArrayDims>, DstBlob>& dstView,
std::size_t threadId,
std::size_t threadCount)
{
constexpr auto readOpt = true; // TODO(bgruber): how to choose?
aosoaCommonBlockCopy(srcView, dstView, readOpt, threadId, threadCount);
}
};
template<
typename ArrayExtents,
typename RecordDim,
typename LinearizeArrayDims,
std::size_t LanesSrc,
bool DstSeparateBuffers>
struct Copy<
mapping::AoSoA<ArrayExtents, RecordDim, LanesSrc, LinearizeArrayDims>,
mapping::SoA<ArrayExtents, RecordDim, DstSeparateBuffers, LinearizeArrayDims>>
{
template<typename SrcBlob, typename DstBlob>
void operator()(
const View<mapping::AoSoA<ArrayExtents, RecordDim, LanesSrc, LinearizeArrayDims>, SrcBlob>& srcView,
View<mapping::SoA<ArrayExtents, RecordDim, DstSeparateBuffers, LinearizeArrayDims>, DstBlob>& dstView,
std::size_t threadId,
std::size_t threadCount)
{
constexpr auto readOpt = true; // TODO(bgruber): how to choose?
aosoaCommonBlockCopy(srcView, dstView, readOpt, threadId, threadCount);
}
};
template<
typename ArrayExtents,
typename RecordDim,
typename LinearizeArrayDims,
std::size_t LanesDst,
bool SrcSeparateBuffers>
struct Copy<
mapping::SoA<ArrayExtents, RecordDim, SrcSeparateBuffers, LinearizeArrayDims>,
mapping::AoSoA<ArrayExtents, RecordDim, LanesDst, LinearizeArrayDims>>
{
template<typename SrcBlob, typename DstBlob>
void operator()(
const View<mapping::SoA<ArrayExtents, RecordDim, SrcSeparateBuffers, LinearizeArrayDims>, SrcBlob>&
srcView,
View<mapping::AoSoA<ArrayExtents, RecordDim, LanesDst, LinearizeArrayDims>, DstBlob>& dstView,
std::size_t threadId,
std::size_t threadCount)
{
constexpr auto readOpt = true; // TODO(bgruber): how to choose?
aosoaCommonBlockCopy(srcView, dstView, readOpt, threadId, threadCount);
}
};
/// Copy data from source view to destination view. Both views need to have the same array and record
/// dimensions. Delegates to \ref Copy to choose an implementation.
/// @param threadId Optional. Zero-based id of calling thread for multi-threaded invocations.
/// @param threadCount Optional. Thread count in case of multi-threaded invocation.
template<typename SrcMapping, typename SrcBlob, typename DstMapping, typename DstBlob>
void copy(
const View<SrcMapping, SrcBlob>& srcView,
View<DstMapping, DstBlob>& dstView,
std::size_t threadId = 0,
std::size_t threadCount = 1)
{
Copy<SrcMapping, DstMapping>{}(srcView, dstView, threadId, threadCount);
}
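// Usage sketch (not part of the library; srcView/dstView are assumed views with equal record dimension and
// extents): the generic entry point, which selects a strategy via the Copy specializations above:
//     llama::copy(srcView, dstView);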
} // namespace llama
// ==
// == ./Copy.hpp ==
// ============================================================================
// ============================================================================
// == ./DumpMapping.hpp ==
// ==
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
#if !__has_include(<fmt/format.h>)
# error DumpMapping.hpp requires the fmt library
#endif
// #include "ArrayIndexRange.hpp" // amalgamate: file already expanded
// #include "Core.hpp" // amalgamate: file already expanded
#include <boost/functional/hash.hpp>
#include <fmt/format.h>
// #include <string> // amalgamate: file already included
// #include <vector> // amalgamate: file already included
namespace llama
{
namespace internal
{
template<std::size_t... Coords>
auto toVec(RecordCoord<Coords...>) -> std::vector<std::size_t>
{
return {Coords...};
}
inline auto color(const std::vector<std::size_t>& recordCoord) -> std::size_t
{
auto c = boost::hash<std::vector<std::size_t>>{}(recordCoord) & 0xFFFFFF;
c |= 0x404040; // ensure color per channel is at least 0x40.
return c;
}
template<std::size_t Dim>
auto formatArrayIndex(const ArrayIndex<Dim>& ai)
{
if constexpr(Dim == 1)
return std::to_string(ai[0]);
else
{
std::string s = "{";
for(auto v : ai)
{
if(s.size() >= 2)
s += ",";
s += std::to_string(v);
}
s += "}";
return s;
}
}
template<std::size_t Dim>
struct FieldBox
{
ArrayIndex<Dim> arrayIndex;
std::vector<std::size_t> recordCoord;
std::string recordTags;
NrAndOffset nrAndOffset;
std::size_t size;
};
template<typename Mapping>
auto boxesFromMapping(const Mapping& mapping) -> std::vector<FieldBox<Mapping::ArrayIndex::rank>>
{
std::vector<FieldBox<Mapping::ArrayIndex::rank>> infos;
using RecordDim = typename Mapping::RecordDim;
for(auto ai : ArrayIndexRange{mapping.extents()})
{
forEachLeafCoord<RecordDim>(
[&](auto rc)
{
infos.push_back(
{ai,
internal::toVec(rc),
recordCoordTags<RecordDim>(rc),
mapping.blobNrAndOffset(ai, rc),
sizeof(GetType<RecordDim, decltype(rc)>)});
});
}
return infos;
}
template<std::size_t Dim>
auto breakBoxes(std::vector<FieldBox<Dim>> boxes, std::size_t wrapByteCount) -> std::vector<FieldBox<Dim>>
{
for(std::size_t i = 0; i < boxes.size(); i++)
{
auto& fb = boxes[i];
if(fb.nrAndOffset.offset / wrapByteCount != (fb.nrAndOffset.offset + fb.size - 1) / wrapByteCount)
{
const auto remainingSpace = wrapByteCount - fb.nrAndOffset.offset % wrapByteCount;
auto newFb = fb;
newFb.nrAndOffset.offset = fb.nrAndOffset.offset + remainingSpace;
newFb.size = fb.size - remainingSpace;
fb.size = remainingSpace;
boxes.push_back(newFb);
}
}
return boxes;
}
inline auto cssClass(std::string tags)
{
std::replace(begin(tags), end(tags), '.', '_');
std::replace(begin(tags), end(tags), '<', '_');
std::replace(begin(tags), end(tags), '>', '_');
return tags;
};
} // namespace internal
/// Returns an SVG image visualizing the memory layout created by the given mapping. The created memory blocks are
/// wrapped after wrapByteCount bytes.
template<typename Mapping>
auto toSvg(const Mapping& mapping, std::size_t wrapByteCount = 64, bool breakBoxes = true) -> std::string
{
constexpr auto byteSizeInPixel = 30;
constexpr auto blobBlockWidth = 60;
auto infos = internal::boxesFromMapping(mapping);
if(breakBoxes)
infos = internal::breakBoxes(std::move(infos), wrapByteCount);
std::string svg;
std::array<int, Mapping::blobCount + 1> blobYOffset{};
for(std::size_t i = 0; i < Mapping::blobCount; i++)
{
const auto blobRows = (mapping.blobSize(i) + wrapByteCount - 1) / wrapByteCount;
blobYOffset[i + 1] = blobYOffset[i] + (blobRows + 1) * byteSizeInPixel; // one row gap between blobs
const auto height = blobRows * byteSizeInPixel;
svg += fmt::format(
R"a(<rect x="0" y="{}" width="{}" height="{}" fill="#AAA" stroke="#000"/>
<text x="{}" y="{}" fill="#000" text-anchor="middle">Blob: {}</text>
)a",
blobYOffset[i],
blobBlockWidth,
height,
blobBlockWidth / 2,
blobYOffset[i] + height / 2,
i);
}
svg = fmt::format(
R"(<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg width="{}" height="{}" xmlns="http://www.w3.org/2000/svg">
<style>
.label {{ font: {}px sans-serif; }}
</style>
)",
blobBlockWidth + wrapByteCount * byteSizeInPixel,
blobYOffset.back() - byteSizeInPixel,
byteSizeInPixel / 2)
+ svg;
for(const auto& info : infos)
{
const auto blobY = blobYOffset[info.nrAndOffset.nr];
auto x = (info.nrAndOffset.offset % wrapByteCount) * byteSizeInPixel + blobBlockWidth;
auto y = (info.nrAndOffset.offset / wrapByteCount) * byteSizeInPixel + blobY;
const auto fill = internal::color(info.recordCoord);
const auto width = byteSizeInPixel * info.size;
constexpr auto cropBoxes = true;
if(cropBoxes)
{
svg += fmt::format(
R"(<svg x="{}" y="{}" width="{}" height="{}">
)",
x,
y,
width,
byteSizeInPixel);
x = 0;
y = 0;
}
svg += fmt::format(
R"(<rect x="{}" y="{}" width="{}" height="{}" fill="#{:X}" stroke="#000"/>
)",
x,
y,
width,
byteSizeInPixel,
fill);
for(std::size_t i = 1; i < info.size; i++)
{
svg += fmt::format(
R"(<line x1="{}" y1="{}" x2="{}" y2="{}" stroke="#777"/>
)",
x + i * byteSizeInPixel,
y + byteSizeInPixel * 2 / 3,
x + i * byteSizeInPixel,
y + byteSizeInPixel);
}
svg += fmt::format(
R"(<text x="{}" y="{}" fill="#000" text-anchor="middle" class="label">{} {}</text>
)",
x + width / 2,
y + byteSizeInPixel * 3 / 4,
internal::formatArrayIndex(info.arrayIndex),
info.recordTags);
if(cropBoxes)
svg += R"(</svg>
)";
}
svg += "</svg>";
return svg;
}
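// Usage sketch (not part of the library; the mapping instance and <fstream> are assumptions): write the SVG to a
// file for viewing in a browser:
//     std::ofstream{"layout.svg"} << llama::toSvg(mapping);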
/// Returns an HTML document visualizing the memory layout created by the given mapping. The visualization is
/// resizeable.
template<typename Mapping>
auto toHtml(const Mapping& mapping) -> std::string
{
constexpr auto byteSizeInPixel = 30;
constexpr auto rulerLengthInBytes = 512;
constexpr auto rulerByteInterval = 8;
auto infos = internal::boxesFromMapping(mapping);
std::stable_sort(
begin(infos),
end(infos),
[](const auto& a, const auto& b) {
return std::tie(a.nrAndOffset.nr, a.nrAndOffset.offset)
< std::tie(b.nrAndOffset.nr, b.nrAndOffset.offset);
});
infos.erase(
std::unique(
begin(infos),
end(infos),
[](const auto& a, const auto& b) { return a.nrAndOffset == b.nrAndOffset; }),
end(infos));
std::string html;
html += fmt::format(
R"(<!DOCTYPE html>
<html>
<head>
<style>
.box {{
outline: 1px solid;
display: inline-block;
white-space: nowrap;
height: {}px;
background: repeating-linear-gradient(90deg, #0000, #0000 29px, #777 29px, #777 30px);
text-align: center;
overflow: hidden;
vertical-align: middle;
}}
#ruler {{
background: repeating-linear-gradient(90deg, #0000, #0000 29px, #000 29px, #000 30px);
border-bottom: 1px solid;
height: 20px;
margin-bottom: 20px;
}}
#ruler div {{
position: absolute;
display: inline-block;
}}
)",
byteSizeInPixel);
using RecordDim = typename Mapping::RecordDim;
forEachLeafCoord<RecordDim>(
[&](auto rc)
{
constexpr int size = sizeof(GetType<RecordDim, decltype(rc)>);
html += fmt::format(
R"(.{} {{
width: {}px;
background-color: #{:X};
}}
)",
internal::cssClass(recordCoordTags<RecordDim>(rc)),
byteSizeInPixel * size,
internal::color(internal::toVec(rc)));
});
html += fmt::format(R"(</style>
</head>
<body>
<header id="ruler">
)");
for(auto i = 0; i < rulerLengthInBytes; i += rulerByteInterval)
html += fmt::format(
R"(<div style="margin-left: {}px;">{}</div>)",
i * byteSizeInPixel,
i);
html += fmt::format(R"(
</header>
)");
auto currentBlobNr = std::numeric_limits<std::size_t>::max();
for(const auto& info : infos)
{
if(currentBlobNr != info.nrAndOffset.nr)
{
currentBlobNr = info.nrAndOffset.nr;
html += fmt::format("<h1>Blob: {}</h1>", currentBlobNr);
}
html += fmt::format(
R"(<div class="box {0}" title="{1} {2}">{1} {2}</div>)",
internal::cssClass(info.recordTags),
internal::formatArrayIndex(info.arrayIndex),
info.recordTags);
}
html += R"(</body>
</html>)";
return html;
}
} // namespace llama
// ==
// == ./DumpMapping.hpp ==
// ============================================================================
// ============================================================================
// == ./llama.hpp ==
// ==
// Copyright 2018 Alexander Matthes
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
/// \mainpage LLAMA API documentation
///
/// LLAMA is a C++17 template header-only library for the abstraction of memory access patterns. It distinguishes
/// between the view of the algorithm on the memory and the real layout in the background. This enables performance
/// portability for multicore, manycore and GPU applications with the very same code.
///
/// In contrast to many other solutions, LLAMA can define nested data structures of arbitrary depth and is not
/// limited to struct-of-array and array-of-struct data layouts. It is also capable of explicitly defining padding,
/// blocking, striding and any other run-time or compile-time access pattern simultaneously.
///
/// To achieve this goal, LLAMA is split into mostly independent, orthogonal parts completely written in modern C++17
/// to run on as many architectures and with as many compilers as possible, while still supporting the extensions
/// needed e.g. to run on GPUs or other many-core hardware.
///
/// This page documents the API of LLAMA. The user documentation and an overview about the concepts and ideas can be
/// found here: https://llama-doc.rtfd.io
///
/// LLAMA is licensed under the LGPL3+.
#define LLAMA_VERSION_MAJOR 0
#define LLAMA_VERSION_MINOR 3
#define LLAMA_VERSION_PATCH 0
#ifdef __NVCC__
# pragma push
# if __CUDACC_VER_MAJOR__ * 1000 + __CUDACC_VER_MINOR__ >= 11005
# pragma nv_diag_suppress 940
# else
# pragma diag_suppress 940
# endif
#endif
// #include "ArrayExtents.hpp" // amalgamate: file already expanded
// #include "ArrayIndexRange.hpp" // amalgamate: file already expanded
// #include "BlobAllocators.hpp" // amalgamate: file already expanded
// #include "Copy.hpp" // amalgamate: file already expanded
// #include "Core.hpp" // amalgamate: file already expanded
// #include "Meta.hpp" // amalgamate: file already expanded
// #include "Vector.hpp" // amalgamate: file already expanded
// #include "View.hpp" // amalgamate: file already expanded
// #include "VirtualRecord.hpp" // amalgamate: file already expanded
// #include "macros.hpp" // amalgamate: file already expanded
// ============================================================================
// == ./mapping/AoS.hpp ==
// ==
// Copyright 2018 Alexander Matthes
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
// #include "Common.hpp" // amalgamate: file already expanded
namespace llama::mapping
{
/// Array of struct mapping. Used to create a \ref View via \ref allocView.
/// \tparam AlignAndPad If true, padding bytes are inserted to guarantee that struct members are properly aligned.
/// If false, struct members are tightly packed.
/// \tparam TLinearizeArrayDimsFunctor Defines how the array dimensions should be mapped into linear numbers and
/// how big the linear domain gets.
/// \tparam FlattenRecordDim Defines how the record dimension's fields should be flattened. See \ref
/// FlattenRecordDimInOrder, \ref FlattenRecordDimIncreasingAlignment, \ref FlattenRecordDimDecreasingAlignment and
/// \ref FlattenRecordDimMinimizePadding.
template<
typename TArrayExtents,
typename TRecordDim,
bool AlignAndPad = true,
typename TLinearizeArrayDimsFunctor = LinearizeArrayDimsCpp,
template<typename> typename FlattenRecordDim = FlattenRecordDimInOrder>
struct AoS : private TArrayExtents
{
using ArrayExtents = TArrayExtents;
using ArrayIndex = typename ArrayExtents::Index;
using RecordDim = TRecordDim;
using LinearizeArrayDimsFunctor = TLinearizeArrayDimsFunctor;
static constexpr std::size_t blobCount = 1;
constexpr AoS() = default;
LLAMA_FN_HOST_ACC_INLINE
constexpr explicit AoS(ArrayExtents extents, RecordDim = {}) : ArrayExtents(extents)
{
}
LLAMA_FN_HOST_ACC_INLINE constexpr auto extents() const -> ArrayExtents
{
return *this;
}
LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(std::size_t) const -> std::size_t
{
return LinearizeArrayDimsFunctor{}.size(extents())
* flatSizeOf<typename Flattener::FlatRecordDim, AlignAndPad>;
}
template<std::size_t... RecordCoords>
LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset(ArrayIndex ai, RecordCoord<RecordCoords...> = {}) const
-> NrAndOffset
{
constexpr std::size_t flatFieldIndex =
#ifdef __NVCC__
*& // mess with nvcc compiler state to workaround bug
#endif
Flattener::template flatIndex<RecordCoords...>;
const auto offset
= LinearizeArrayDimsFunctor{}(ai, extents())
* flatSizeOf<
typename Flattener::FlatRecordDim,
AlignAndPad> + flatOffsetOf<typename Flattener::FlatRecordDim, flatFieldIndex, AlignAndPad>;
return {0, offset};
}
private:
using Flattener = FlattenRecordDim<TRecordDim>;
};
/// Array of struct mapping preserving the alignment of the field types by inserting padding.
/// \see AoS
template<typename ArrayExtents, typename RecordDim, typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp>
using AlignedAoS = AoS<ArrayExtents, RecordDim, true, LinearizeArrayDimsFunctor>;
/// Array of struct mapping preserving the alignment of the field types by inserting padding and permuting the
/// field order to minimize this padding. \see AoS
template<typename ArrayExtents, typename RecordDim, typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp>
using MinAlignedAoS
= AoS<ArrayExtents, RecordDim, true, LinearizeArrayDimsFunctor, FlattenRecordDimMinimizePadding>;
/// Array of struct mapping packing the field types tightly, violating the types' alignment requirements.
/// \see AoS
template<typename ArrayExtents, typename RecordDim, typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp>
using PackedAoS = AoS<ArrayExtents, RecordDim, false, LinearizeArrayDimsFunctor>;
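// Usage sketch (not part of the library; the extents type and the record dimension Vec are assumptions):
//     using Mapping = llama::mapping::AlignedAoS<llama::ArrayExtentsDynamic<1>, Vec>;
//     auto view = llama::allocView(Mapping{llama::ArrayExtentsDynamic<1>{1024}});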
template<bool AlignAndPad = true, typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp>
struct PreconfiguredAoS
{
template<typename ArrayExtents, typename RecordDim>
using type = AoS<ArrayExtents, RecordDim, AlignAndPad, LinearizeArrayDimsFunctor>;
};
template<typename Mapping>
inline constexpr bool isAoS = false;
template<
typename ArrayExtents,
typename RecordDim,
bool AlignAndPad,
typename LinearizeArrayDimsFunctor,
template<typename>
typename FlattenRecordDim>
inline constexpr bool
isAoS<AoS<ArrayExtents, RecordDim, AlignAndPad, LinearizeArrayDimsFunctor, FlattenRecordDim>> = true;
} // namespace llama::mapping
// ==
// == ./mapping/AoS.hpp ==
// ============================================================================
// #include "mapping/AoSoA.hpp" // amalgamate: file already expanded
// ============================================================================
// == ./mapping/Bytesplit.hpp ==
// ==
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
// #include "Common.hpp" // amalgamate: file already expanded
namespace llama::mapping
{
namespace internal
{
template<typename T>
using ReplaceByByteArray = std::byte[sizeof(T)];
template<typename RecordDim>
using SplitBytes = TransformLeaves<RecordDim, ReplaceByByteArray>;
} // namespace internal
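/// Meta mapping which replaces each leaf type T of the record dimension by a byte array std::byte[sizeof(T)] and
/// maps the resulting record dimension with the given inner mapping. All fields are computed: reads and writes go
/// through a proxy reference which assembles/disassembles the value byte by byte via the inner mapping.
/// \tparam InnerMapping The mapping applied to the byte-split record dimension.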
template<typename TArrayExtents, typename TRecordDim, template<typename, typename> typename InnerMapping>
struct Bytesplit : private InnerMapping<TArrayExtents, internal::SplitBytes<TRecordDim>>
{
using Inner = InnerMapping<TArrayExtents, internal::SplitBytes<TRecordDim>>;
using ArrayExtents = typename Inner::ArrayExtents;
using ArrayIndex = typename Inner::ArrayIndex;
using RecordDim = TRecordDim; // hide Inner::RecordDim
using Inner::blobCount;
using Inner::blobSize;
using Inner::extents;
using Inner::Inner;
LLAMA_FN_HOST_ACC_INLINE
constexpr explicit Bytesplit(TArrayExtents extents, TRecordDim = {}) : Inner(extents)
{
}
template<std::size_t... RecordCoords>
static constexpr auto isComputed(RecordCoord<RecordCoords...>)
{
return true;
}
template<typename QualifiedBase, typename RC, typename BlobArray>
struct Reference
{
QualifiedBase& innerMapping;
ArrayIndex ai;
BlobArray& blobs;
using DstType = GetType<TRecordDim, RC>;
// NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions)
operator DstType() const
{
DstType v;
auto* p = reinterpret_cast<std::byte*>(&v);
boost::mp11::mp_for_each<boost::mp11::mp_iota_c<sizeof(DstType)>>(
[&](auto ic)
{
constexpr auto i = decltype(ic)::value;
const auto [nr, off] = innerMapping.blobNrAndOffset(ai, Cat<RC, RecordCoord<i>>{});
p[i] = blobs[nr][off];
});
return v;
}
auto operator=(DstType v) -> Reference&
{
auto* p = reinterpret_cast<std::byte*>(&v);
boost::mp11::mp_for_each<boost::mp11::mp_iota_c<sizeof(DstType)>>(
[&](auto ic)
{
constexpr auto i = decltype(ic)::value;
const auto [nr, off] = innerMapping.blobNrAndOffset(ai, Cat<RC, RecordCoord<i>>{});
blobs[nr][off] = p[i];
});
return *this;
}
};
template<std::size_t... RecordCoords, typename BlobArray>
LLAMA_FN_HOST_ACC_INLINE constexpr auto compute(
typename Inner::ArrayIndex ai,
RecordCoord<RecordCoords...>,
BlobArray& blobs) const
{
return Reference<decltype(*this), RecordCoord<RecordCoords...>, BlobArray>{*this, ai, blobs};
}
};
} // namespace llama::mapping
// ==
// == ./mapping/Bytesplit.hpp ==
// ============================================================================
// ============================================================================
// == ./mapping/Heatmap.hpp ==
// ==
// #pragma once
// #include "Common.hpp" // amalgamate: file already expanded
// #include <array> // amalgamate: file already included
#include <atomic>
#include <sstream>
// #include <vector> // amalgamate: file already included
namespace llama::mapping
{
/// Forwards all calls to the inner mapping. Counts all accesses made to each byte, so that a heatmap can be extracted.
/// \tparam Mapping The type of the inner mapping.
template<typename Mapping, typename CountType = std::size_t>
struct Heatmap
{
using ArrayExtents = typename Mapping::ArrayExtents;
using ArrayIndex = typename Mapping::ArrayIndex;
using RecordDim = typename Mapping::RecordDim;
static constexpr std::size_t blobCount = Mapping::blobCount;
constexpr Heatmap() = default;
LLAMA_FN_HOST_ACC_INLINE
explicit Heatmap(Mapping mapping) : mapping(mapping)
{
for(std::size_t i = 0; i < blobCount; i++)
byteHits[i] = std::vector<std::atomic<CountType>>(blobSize(i));
}
Heatmap(const Heatmap&) = delete;
auto operator=(const Heatmap&) -> Heatmap& = delete;
Heatmap(Heatmap&&) noexcept = default;
auto operator=(Heatmap&&) noexcept -> Heatmap& = default;
~Heatmap() = default;
LLAMA_FN_HOST_ACC_INLINE constexpr auto extents() const -> ArrayExtents
{
return mapping.extents();
}
LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(std::size_t i) const -> std::size_t
{
LLAMA_FORCE_INLINE_RECURSIVE
return mapping.blobSize(i);
}
template<std::size_t... RecordCoords>
LLAMA_FN_HOST_ACC_INLINE auto blobNrAndOffset(ArrayIndex ai, RecordCoord<RecordCoords...> rc = {}) const
-> NrAndOffset
{
const auto nao = mapping.blobNrAndOffset(ai, rc);
for(std::size_t i = 0; i < sizeof(GetType<RecordDim, RecordCoord<RecordCoords...>>); i++)
byteHits[nao.nr][nao.offset + i]++;
return nao;
}
auto toGnuplotScript(std::size_t wrapAfterBytes = 64) const -> std::string
{
std::stringstream f;
f << "#!/usr/bin/gnuplot -p\n$data << EOD\n";
for(std::size_t i = 0; i < blobCount; i++)
{
std::size_t byteCount = 0;
for(const auto& hits : byteHits[i])
f << hits << ((++byteCount % wrapAfterBytes == 0) ? '\n' : ' ');
while(byteCount++ % wrapAfterBytes != 0)
f << "0 ";
f << '\n';
}
f << R"(EOD
set view map
set xtics format ""
set x2tics autofreq 8
set yrange [] reverse
set link x2; set link y2
set ylabel "Cacheline"
set x2label "Byte"
plot $data matrix with image axes x2y1
)";
return f.str();
}
Mapping mapping;
mutable std::array<std::vector<std::atomic<CountType>>, blobCount> byteHits;
};
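// Usage sketch (not part of the library; innerMapping, runKernel and <fstream> are assumptions):
//     auto view = llama::allocView(llama::mapping::Heatmap{innerMapping});
//     runKernel(view); // hypothetical workload accessing the view
//     std::ofstream{"heatmap.sh"} << view.mapping().toGnuplotScript();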
} // namespace llama::mapping
// ==
// == ./mapping/Heatmap.hpp ==
// ============================================================================
// #include "mapping/One.hpp" // amalgamate: file already expanded
// #include "mapping/SoA.hpp" // amalgamate: file already expanded
// ============================================================================
// == ./mapping/Split.hpp ==
// ==
// #pragma once
// #include "Common.hpp" // amalgamate: file already expanded
namespace llama::mapping
{
namespace internal
{
template<typename... Fields, std::size_t FirstCoord, std::size_t... Coords>
auto partitionRecordDim(Record<Fields...>, RecordCoord<FirstCoord, Coords...>)
{
using namespace boost::mp11;
using Rec = Record<Fields...>;
if constexpr(sizeof...(Coords) == 0)
{
using Part1 = Record<mp_at_c<Rec, FirstCoord>>;
using Part2 = mp_erase_c<Rec, FirstCoord, FirstCoord + 1>;
return mp_list<Part1, Part2>{};
}
else
{
using FieldTag = GetTag<Rec, RecordCoord<FirstCoord>>;
using FieldType = GetType<Rec, RecordCoord<FirstCoord>>;
using InnerPartition = decltype(partitionRecordDim(FieldType{}, RecordCoord<Coords...>{}));
using Part1 = Record<Field<FieldTag, mp_first<InnerPartition>>>;
using Part2 = mp_replace_at_c<Rec, FirstCoord, Field<FieldTag, mp_second<InnerPartition>>>;
return mp_list<Part1, Part2>{};
}
}
template<typename Acc, typename TagList>
struct PartitionFoldOpImpl
{
using Part1Before = boost::mp11::mp_first<Acc>;
using Part2Before = boost::mp11::mp_second<Acc>;
using R = decltype(partitionRecordDim(Part2Before{}, GetCoordFromTags<Part2Before, TagList>{}));
using Part1After = boost::mp11::mp_first<R>;
using Part2After = boost::mp11::mp_second<R>;
using type = boost::mp11::mp_list<MergedRecordDims<Part1Before, Part1After>, Part2After>;
};
template<typename Acc, typename TagList>
using PartitionFoldOp = typename PartitionFoldOpImpl<Acc, TagList>::type;
template<typename... Fields, typename... RCs>
auto partitionRecordDim(Record<Fields...>, boost::mp11::mp_list<RCs...>)
{
using namespace boost::mp11;
using Initial = mp_list<Record<>, Record<Fields...>>; // initially, nothing selected for mapping 1
return mp_fold<mp_list<GetTags<Record<Fields...>, RCs>...>, Initial, PartitionFoldOp>{};
}
// workaround for nvcc 11.3 and below: we cannot put the decltype() directly into the Split class
template<typename RecordDim, typename RecordCoordForMapping1>
struct PartionedRecordDim
{
using type = decltype(partitionRecordDim(RecordDim{}, RecordCoordForMapping1{}));
};
template<typename RC, typename RecordCoordForMapping1>
inline constexpr bool isSelected = RecordCoordCommonPrefixIsSame<RecordCoordForMapping1, RC>;
template<typename RC>
struct IsSelectedPredicate
{
template<typename RecordCoordForMapping1>
using fn = boost::mp11::mp_bool<isSelected<RC, RecordCoordForMapping1>>;
};
template<typename RC, typename... RecordCoordsForMapping1>
inline constexpr bool isSelected<RC, boost::mp11::mp_list<RecordCoordsForMapping1...>> = boost::mp11::
mp_any_of_q<boost::mp11::mp_list<RecordCoordsForMapping1...>, IsSelectedPredicate<RC>>::value;
} // namespace internal
/// Mapping which splits off a part of the record dimension and maps it differently than the rest.
/// \tparam RecordCoordForMapping1 A \ref RecordCoord or a list of RecordCoords selecting the part of the record
/// dimension to be mapped differently.
/// \tparam MappingTemplate1 The mapping used for the selected part of the record dimension.
/// \tparam MappingTemplate2 The mapping used for the not selected part of the record dimension.
/// \tparam SeparateBlobs If true, both pieces of the record dimension are mapped to separate blobs.
template<
typename TArrayExtents,
typename TRecordDim,
typename RecordCoordForMapping1,
template<typename...>
typename MappingTemplate1,
template<typename...>
typename MappingTemplate2,
bool SeparateBlobs = false>
struct Split
{
using ArrayExtents = TArrayExtents;
using ArrayIndex = typename ArrayExtents::Index;
using RecordDim = TRecordDim;
using RecordDimPartitions = typename internal::PartionedRecordDim<RecordDim, RecordCoordForMapping1>::type;
using RecordDim1 = boost::mp11::mp_first<RecordDimPartitions>;
using RecordDim2 = boost::mp11::mp_second<RecordDimPartitions>;
using Mapping1 = MappingTemplate1<ArrayExtents, RecordDim1>;
using Mapping2 = MappingTemplate2<ArrayExtents, RecordDim2>;
static constexpr std::size_t blobCount = SeparateBlobs ? Mapping1::blobCount + Mapping2::blobCount : 1;
static_assert(SeparateBlobs || Mapping1::blobCount == 1);
static_assert(SeparateBlobs || Mapping2::blobCount == 1);
constexpr Split() = default;
LLAMA_FN_HOST_ACC_INLINE
constexpr explicit Split(ArrayExtents extents) : mapping1(extents), mapping2(extents)
{
}
LLAMA_FN_HOST_ACC_INLINE constexpr auto extents() const -> ArrayExtents
{
return mapping1.extents();
}
LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize([[maybe_unused]] std::size_t i) const -> std::size_t
{
if constexpr(SeparateBlobs)
{
if(i < Mapping1::blobCount)
return mapping1.blobSize(i);
return mapping2.blobSize(i - Mapping1::blobCount);
}
else
return mapping1.blobSize(0) + mapping2.blobSize(0);
}
template<std::size_t... RecordCoords>
LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset(ArrayIndex ai, RecordCoord<RecordCoords...> = {}) const
-> NrAndOffset
{
using Tags = GetTags<RecordDim, RecordCoord<RecordCoords...>>;
if constexpr(internal::isSelected<RecordCoord<RecordCoords...>, RecordCoordForMapping1>)
return mapping1.blobNrAndOffset(ai, GetCoordFromTags<RecordDim1, Tags>{});
else
{
auto nrAndOffset = mapping2.blobNrAndOffset(ai, GetCoordFromTags<RecordDim2, Tags>{});
if constexpr(SeparateBlobs)
nrAndOffset.nr += Mapping1::blobCount;
else
{
for(std::size_t i = 0; i < Mapping1::blobCount; i++)
nrAndOffset.offset += mapping1.blobSize(i);
}
return nrAndOffset;
}
}
Mapping1 mapping1;
Mapping2 mapping2;
};
template<
typename RecordCoordsForMapping1,
template<typename...>
typename MappingTemplate1,
template<typename...>
typename MappingTemplate2,
bool SeparateBlobs = false>
struct PreconfiguredSplit
{
template<typename ArrayExtents, typename RecordDim>
using type = Split<
ArrayExtents,
RecordDim,
RecordCoordsForMapping1,
MappingTemplate1,
MappingTemplate2,
SeparateBlobs>;
};
} // namespace llama::mapping
// ==
// == ./mapping/Split.hpp ==
// ============================================================================
// ============================================================================
// == ./mapping/Trace.hpp ==
// ==
// #pragma once
// #include "Common.hpp" // amalgamate: file already expanded
// #include <atomic> // amalgamate: file already included
// #include <iostream> // amalgamate: file already included
// #include <string> // amalgamate: file already included
#include <unordered_map>
namespace llama::mapping
{
/// Forwards all calls to the inner mapping. Traces all accesses made through this mapping and prints a summary on
/// destruction.
/// \tparam Mapping The type of the inner mapping.
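    /// Example (a sketch, assuming an existing inner mapping instance `innerMapping`):
    /// \code
    ///     auto view = llama::allocView(llama::mapping::Trace{innerMapping});
    ///     // ... access the view as usual; a summary of per-field access counts is
    ///     // printed when the Trace mapping is destroyed together with the view
    /// \endcode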
template<typename Mapping>
struct Trace
{
using ArrayExtents = typename Mapping::ArrayExtents;
using ArrayIndex = typename Mapping::ArrayIndex;
using RecordDim = typename Mapping::RecordDim;
static constexpr std::size_t blobCount = Mapping::blobCount;
constexpr Trace() = default;
LLAMA_FN_HOST_ACC_INLINE
explicit Trace(Mapping mapping, bool printOnDestruction = true)
: mapping(mapping)
, printOnDestruction(printOnDestruction)
{
forEachLeafCoord<RecordDim>([&](auto rc) { fieldHits[recordCoordTags<RecordDim>(rc)] = 0; });
}
Trace(const Trace&) = delete;
auto operator=(const Trace&) -> Trace& = delete;
Trace(Trace&&) noexcept = default;
auto operator=(Trace&&) noexcept -> Trace& = default;
~Trace()
{
if(printOnDestruction && !fieldHits.empty())
print();
}
LLAMA_FN_HOST_ACC_INLINE constexpr auto extents() const -> ArrayExtents
{
return mapping.extents();
}
LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(std::size_t i) const -> std::size_t
{
LLAMA_FORCE_INLINE_RECURSIVE
return mapping.blobSize(i);
}
template<std::size_t... RecordCoords>
LLAMA_FN_HOST_ACC_INLINE auto blobNrAndOffset(ArrayIndex ai, RecordCoord<RecordCoords...> rc = {}) const
-> NrAndOffset
{
const static auto name = recordCoordTags<RecordDim>(RecordCoord<RecordCoords...>{});
fieldHits.at(name)++;
LLAMA_FORCE_INLINE_RECURSIVE return mapping.blobNrAndOffset(ai, rc);
}
void print() const
{
std::cout << "Trace mapping, number of accesses:\n";
for(const auto& [k, v] : fieldHits)
std::cout << '\t' << k << ":\t" << v << '\n';
}
Mapping mapping;
mutable std::unordered_map<std::string, std::atomic<std::size_t>> fieldHits;
bool printOnDestruction;
};
} // namespace llama::mapping
// ==
// == ./mapping/Trace.hpp ==
// ============================================================================
// ============================================================================
// == ./mapping/tree/Mapping.hpp ==
// ==
// Copyright 2018 Alexander Matthes
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
// #include "../Common.hpp" // amalgamate: file already expanded
// ============================================================================
// == ./mapping/tree/Functors.hpp ==
// ==
// Copyright 2018 Alexander Matthes
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
// ============================================================================
// == ./mapping/tree/TreeFromDimensions.hpp ==
// ==
// Copyright 2018 Alexander Matthes
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
// #include "../../Core.hpp" // amalgamate: file already expanded
// ============================================================================
// == ./Tuple.hpp ==
// ==
// Copyright 2018 Alexander Matthes
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
// #include "Meta.hpp" // amalgamate: file already expanded
// #include "macros.hpp" // amalgamate: file already expanded
namespace llama
{
template<typename... Elements>
struct Tuple
{
};
/// Tuple class like `std::tuple` but suitable for use with offloading devices like GPUs.
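    /// Example (a minimal sketch):
    /// \code
    ///     llama::Tuple t{1, 2.0f, 'x'}; // deduced as llama::Tuple<int, float, char>
    ///     llama::get<1>(t) = 3.0f;      // element access similar to std::get
    ///     static_assert(std::tuple_size_v<decltype(t)> == 3);
    /// \endcode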
template<typename TFirstElement, typename... Elements>
struct Tuple<TFirstElement, Elements...>
{
using FirstElement = TFirstElement;
using RestTuple = Tuple<Elements...>;
constexpr Tuple() = default;
/// Construct a tuple from values of the same types as the tuple stores.
LLAMA_FN_HOST_ACC_INLINE constexpr explicit Tuple(FirstElement first, Elements... rest)
: first(std::move(first))
, rest(std::move(rest)...)
{
}
        /// Construct a tuple from forwarded values of potentially different types than the tuple stores.
// SFINAE away this ctor if tuple elements cannot be constructed from ctor arguments
template<
typename T,
typename... Ts,
std::enable_if_t<
sizeof...(Elements) == sizeof...(Ts)
&& std::is_constructible_v<TFirstElement, T> && (std::is_constructible_v<Elements, Ts> && ...),
int> = 0>
LLAMA_FN_HOST_ACC_INLINE constexpr explicit Tuple(T&& firstArg, Ts&&... restArgs)
: first(std::forward<T>(firstArg))
, rest(std::forward<Ts>(restArgs)...)
{
}
FirstElement first; ///< the first element (if existing)
#ifndef __NVCC__
[[no_unique_address]] // nvcc 11.3 ICE
#endif
RestTuple rest; ///< the remaining elements
};
template<typename... Elements>
Tuple(Elements...) -> Tuple<std::remove_cv_t<std::remove_reference_t<Elements>>...>;
template<std::size_t Pos, typename... Elements>
LLAMA_FN_HOST_ACC_INLINE constexpr auto get(Tuple<Elements...>& tuple) -> auto&
{
if constexpr(Pos == 0)
return tuple.first;
else
return get<Pos - 1>(tuple.rest);
}
template<std::size_t Pos, typename... Elements>
LLAMA_FN_HOST_ACC_INLINE constexpr auto get(const Tuple<Elements...>& tuple) -> const auto&
{
if constexpr(Pos == 0)
return tuple.first;
else
return get<Pos - 1>(tuple.rest);
}
} // namespace llama
template<typename... Elements>
struct std::tuple_size<llama::Tuple<Elements...>>
{
static constexpr auto value = sizeof...(Elements);
};
template<std::size_t I, typename... Elements>
struct std::tuple_element<I, llama::Tuple<Elements...>>
{
using type = boost::mp11::mp_at_c<llama::Tuple<Elements...>, I>;
};
namespace llama
{
namespace internal
{
template<typename... Elements, std::size_t... Is>
LLAMA_FN_HOST_ACC_INLINE constexpr auto areEqual(
const Tuple<Elements...>& a,
const Tuple<Elements...>& b,
std::index_sequence<Is...>) -> bool
{
return ((get<Is>(a) == get<Is>(b)) && ...);
}
} // namespace internal
template<typename... ElementsA, typename... ElementsB>
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator==(const Tuple<ElementsA...>& a, const Tuple<ElementsB...>& b)
-> bool
{
using namespace boost::mp11;
if constexpr(sizeof...(ElementsA) == sizeof...(ElementsB))
if constexpr(mp_apply<mp_all, mp_transform<std::is_same, mp_list<ElementsA...>, mp_list<ElementsB...>>>::
value)
return internal::areEqual(a, b, std::make_index_sequence<sizeof...(ElementsA)>{});
return false;
}
template<typename... ElementsA, typename... ElementsB>
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator!=(const Tuple<ElementsA...>& a, const Tuple<ElementsB...>& b)
-> bool
{
return !(a == b);
}
namespace internal
{
template<typename Tuple1, typename Tuple2, size_t... Is1, size_t... Is2>
LLAMA_FN_HOST_ACC_INLINE constexpr auto tupleCatImpl(
const Tuple1& t1,
const Tuple2& t2,
std::index_sequence<Is1...>,
std::index_sequence<Is2...>)
{
return Tuple{get<Is1>(t1)..., get<Is2>(t2)...};
}
} // namespace internal
template<typename Tuple1, typename Tuple2>
LLAMA_FN_HOST_ACC_INLINE constexpr auto tupleCat(const Tuple1& t1, const Tuple2& t2)
{
return internal::tupleCatImpl(
t1,
t2,
std::make_index_sequence<std::tuple_size_v<Tuple1>>{},
std::make_index_sequence<std::tuple_size_v<Tuple2>>{});
}
namespace internal
{
template<std::size_t Pos, typename Tuple, typename Replacement>
struct TupleReplaceImpl
{
LLAMA_FN_HOST_ACC_INLINE
auto operator()(Tuple const tuple, Replacement const replacement)
{
return tupleCat(
llama::Tuple{tuple.first},
TupleReplaceImpl<Pos - 1, typename Tuple::RestTuple, Replacement>()(tuple.rest, replacement));
};
};
template<typename... Elements, typename Replacement>
struct TupleReplaceImpl<0, Tuple<Elements...>, Replacement>
{
LLAMA_FN_HOST_ACC_INLINE
auto operator()(Tuple<Elements...> tuple, Replacement const replacement)
{
return tupleCat(Tuple{replacement}, tuple.rest);
};
};
template<typename OneElement, typename Replacement>
struct TupleReplaceImpl<0, Tuple<OneElement>, Replacement>
{
LLAMA_FN_HOST_ACC_INLINE
auto operator()(Tuple<OneElement>, Replacement const replacement)
{
return Tuple{replacement};
}
};
} // namespace internal
/// Creates a copy of a tuple with the element at position Pos replaced by replacement.
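    /// Example (a minimal sketch):
    /// \code
    ///     llama::Tuple t{1, 2.0f, 'x'};
    ///     auto u = llama::tupleReplace<1>(t, 2.0); // u is a llama::Tuple<int, double, char>
    /// \endcode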
template<std::size_t Pos, typename Tuple, typename Replacement>
LLAMA_FN_HOST_ACC_INLINE auto tupleReplace(Tuple tuple, Replacement replacement)
{
return internal::TupleReplaceImpl<Pos, Tuple, Replacement>()(tuple, replacement);
}
namespace internal
{
template<size_t... Is, typename... Elements, typename Functor>
LLAMA_FN_HOST_ACC_INLINE constexpr auto tupleTransformHelper(
std::index_sequence<Is...>,
const Tuple<Elements...>& tuple,
const Functor& functor)
{
// FIXME(bgruber): nvcc fails to compile
// Tuple{functor(get<Is>(tuple))...}
return Tuple<decltype(functor(std::declval<Elements>()))...>{functor(get<Is>(tuple))...};
}
} // namespace internal
/// Applies a functor to every element of a tuple, creating a new tuple with the result of the element
/// transformations. The functor needs to implement a template `operator()` to which all tuple elements are passed.
// TODO(bgruber): replace by mp11 version in Boost 1.74.
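    /// Example (a minimal sketch):
    /// \code
    ///     llama::Tuple t{1, 2.0f};
    ///     auto doubled = llama::tupleTransform(t, [](auto v) { return v + v; }); // llama::Tuple<int, float>
    /// \endcode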
template<typename... Elements, typename Functor>
LLAMA_FN_HOST_ACC_INLINE constexpr auto tupleTransform(const Tuple<Elements...>& tuple, const Functor& functor)
{
return internal::tupleTransformHelper(std::make_index_sequence<sizeof...(Elements)>{}, tuple, functor);
}
/// Returns a copy of the tuple without the first element.
template<typename... Elements>
LLAMA_FN_HOST_ACC_INLINE constexpr auto pop_front(const Tuple<Elements...>& tuple)
{
return tuple.rest;
}
} // namespace llama
// ==
// == ./Tuple.hpp ==
// ============================================================================
// #include <cstddef> // amalgamate: file already included
// #include <string> // amalgamate: file already included
// #include <type_traits> // amalgamate: file already included
namespace llama::mapping::tree
{
template<typename T>
inline constexpr auto one = 1;
template<>
inline constexpr auto one<boost::mp11::mp_size_t<1>> = boost::mp11::mp_size_t<1>{};
template<typename TIdentifier, typename TType, typename CountType = std::size_t>
struct Leaf
{
using Identifier = TIdentifier;
using Type = TType;
const CountType count = one<CountType>;
};
template<typename TIdentifier, typename TChildrenTuple, typename CountType = std::size_t>
struct Node
{
using Identifier = TIdentifier;
using ChildrenTuple = TChildrenTuple;
const CountType count = one<CountType>;
const ChildrenTuple childs = {};
};
template<std::size_t ChildIndex = 0, typename ArrayIndexType = std::size_t>
struct TreeCoordElement
{
static constexpr boost::mp11::mp_size_t<ChildIndex> childIndex = {};
const ArrayIndexType arrayIndex = {};
};
template<std::size_t... Coords>
using TreeCoord = Tuple<TreeCoordElement<Coords, boost::mp11::mp_size_t<0>>...>;
namespace internal
{
template<typename... Coords, std::size_t... Is>
auto treeCoordToString(Tuple<Coords...> treeCoord, std::index_sequence<Is...>) -> std::string
{
auto s
= ((std::to_string(get<Is>(treeCoord).arrayIndex) + ":" + std::to_string(get<Is>(treeCoord).childIndex)
+ ", ")
+ ...);
s.resize(s.length() - 2);
return s;
}
} // namespace internal
template<typename TreeCoord>
auto treeCoordToString(TreeCoord treeCoord) -> std::string
{
return std::string("[ ")
+ internal::treeCoordToString(treeCoord, std::make_index_sequence<std::tuple_size_v<TreeCoord>>{})
+ std::string(" ]");
}
namespace internal
{
template<typename Tag, typename RecordDim, typename CountType>
struct CreateTreeElement
{
using type = Leaf<Tag, RecordDim, boost::mp11::mp_size_t<1>>;
};
template<typename Tag, typename... Fields, typename CountType>
struct CreateTreeElement<Tag, Record<Fields...>, CountType>
{
using type = Node<
Tag,
Tuple<
typename CreateTreeElement<GetFieldTag<Fields>, GetFieldType<Fields>, boost::mp11::mp_size_t<1>>::
type...>,
CountType>;
};
template<typename Tag, typename ChildType, std::size_t Count, typename CountType>
struct CreateTreeElement<Tag, ChildType[Count], CountType>
{
template<std::size_t... Is>
static auto createChildren(std::index_sequence<Is...>)
{
return Tuple<
typename CreateTreeElement<RecordCoord<Is>, ChildType, boost::mp11::mp_size_t<1>>::type...>{};
}
using type = Node<Tag, decltype(createChildren(std::make_index_sequence<Count>{})), CountType>;
};
template<typename Leaf, std::size_t Count>
struct WrapInNNodes
{
using type = Node<NoName, Tuple<typename WrapInNNodes<Leaf, Count - 1>::type>>;
};
template<typename Leaf>
struct WrapInNNodes<Leaf, 0>
{
using type = Leaf;
};
template<typename RecordDim>
using TreeFromRecordDimImpl = typename CreateTreeElement<NoName, RecordDim, std::size_t>::type;
} // namespace internal
template<typename RecordDim>
using TreeFromRecordDim = internal::TreeFromRecordDimImpl<RecordDim>;
template<typename ArrayExtents, typename RecordDim>
using TreeFromDimensions =
typename internal::WrapInNNodes<internal::TreeFromRecordDimImpl<RecordDim>, ArrayExtents::rank - 1>::type;
template<typename RecordDim, std::size_t N, std::size_t Pos = 0>
LLAMA_FN_HOST_ACC_INLINE auto createTree(const ArrayIndex<N>& size)
{
if constexpr(Pos == N - 1)
return TreeFromRecordDim<RecordDim>{size[N - 1]};
else
{
Tuple inner{createTree<RecordDim, N, Pos + 1>(size)};
return Node<NoName, decltype(inner)>{size[Pos], inner};
}
};
namespace internal
{
template<
typename ArrayIndex,
std::size_t... ADIndices,
std::size_t FirstRecordCoord,
std::size_t... RecordCoords>
LLAMA_FN_HOST_ACC_INLINE auto createTreeCoord(
const ArrayIndex& ai,
std::index_sequence<ADIndices...>,
RecordCoord<FirstRecordCoord, RecordCoords...>)
{
return Tuple{
TreeCoordElement<(ADIndices == ArrayIndex::rank - 1 ? FirstRecordCoord : 0)>{ai[ADIndices]}...,
TreeCoordElement<RecordCoords, boost::mp11::mp_size_t<0>>{}...,
TreeCoordElement<0, boost::mp11::mp_size_t<0>>{}};
}
} // namespace internal
template<typename RecordCoord, typename ArrayIndex>
LLAMA_FN_HOST_ACC_INLINE auto createTreeCoord(const ArrayIndex& ai)
{
return internal::createTreeCoord(ai, std::make_index_sequence<ArrayIndex::rank>{}, RecordCoord{});
}
} // namespace llama::mapping::tree
// ==
// == ./mapping/tree/TreeFromDimensions.hpp ==
// ============================================================================
namespace llama::mapping::tree::functor
{
/// Functor for \ref tree::Mapping. Does nothing with the mapping tree. Is used for testing.
struct Idem
{
template<typename Tree>
LLAMA_FN_HOST_ACC_INLINE auto basicToResult(const Tree& tree) const -> Tree
{
return tree;
}
template<typename Tree, typename TreeCoord>
LLAMA_FN_HOST_ACC_INLINE auto basicCoordToResultCoord(const TreeCoord& basicCoord, const Tree&) const
-> TreeCoord
{
return basicCoord;
}
template<typename Tree, typename TreeCoord>
LLAMA_FN_HOST_ACC_INLINE auto resultCoordToBasicCoord(const TreeCoord& resultCoord, const Tree&) const
-> TreeCoord
{
return resultCoord;
}
};
/// Functor for \ref tree::Mapping. Moves all run time parts to the leaves, creating a SoA layout.
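    /// Usually supplied in the tree operation list of \ref tree::Mapping, e.g. as part of
    /// `llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT{}}` (a sketch, see the example at \ref tree::Mapping
    /// further down).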
struct LeafOnlyRT
{
template<typename Tree>
LLAMA_FN_HOST_ACC_INLINE auto basicToResult(Tree tree) const
{
return basicToResultImpl(tree, 1);
}
template<typename Tree, typename BasicCoord>
LLAMA_FN_HOST_ACC_INLINE auto basicCoordToResultCoord(const BasicCoord& basicCoord, const Tree& tree) const
{
return basicCoordToResultCoordImpl(basicCoord, tree);
}
template<typename Tree, typename ResultCoord>
LLAMA_FN_HOST_ACC_INLINE auto resultCoordToBasicCoord(const ResultCoord& resultCoord, const Tree& /*tree*/)
const -> ResultCoord
{
return resultCoord;
}
private:
template<typename Identifier, typename Type, typename CountType>
LLAMA_FN_HOST_ACC_INLINE static auto basicToResultImpl(
const Node<Identifier, Type, CountType>& node,
std::size_t arraySize)
{
auto children = tupleTransform(
node.childs,
[&](auto element) { return basicToResultImpl(element, LLAMA_COPY(node.count) * arraySize); });
return Node<Identifier, decltype(children), boost::mp11::mp_size_t<1>>{{}, children};
}
template<typename Identifier, typename Type, typename CountType>
LLAMA_FN_HOST_ACC_INLINE static auto basicToResultImpl(
const Leaf<Identifier, Type, CountType>& leaf,
std::size_t arraySize)
{
return Leaf<Identifier, Type, std::size_t>{LLAMA_COPY(leaf.count) * arraySize};
}
template<typename BasicCoord, typename NodeOrLeaf>
LLAMA_FN_HOST_ACC_INLINE static auto basicCoordToResultCoordImpl(
const BasicCoord& basicCoord,
const NodeOrLeaf& nodeOrLeaf,
std::size_t arraySize = 0)
{
if constexpr(std::tuple_size_v<BasicCoord> == 1)
return Tuple{TreeCoordElement<BasicCoord::FirstElement::childIndex>{
arraySize + LLAMA_COPY(basicCoord.first.arrayIndex)}};
else
{
const auto& branch = get<BasicCoord::FirstElement::childIndex>(nodeOrLeaf.childs);
auto first = TreeCoordElement<BasicCoord::FirstElement::childIndex, boost::mp11::mp_size_t<0>>{};
return tupleCat(
Tuple{first},
basicCoordToResultCoordImpl(
basicCoord.rest,
branch,
(arraySize + LLAMA_COPY(basicCoord.first.arrayIndex)) * LLAMA_COPY(branch.count)));
}
}
};
namespace internal
{
template<typename TreeCoord, typename Node>
LLAMA_FN_HOST_ACC_INLINE auto getNode(const Node& node)
{
if constexpr(std::is_same_v<TreeCoord, Tuple<>>)
return node;
else
return getNode<typename TreeCoord::RestTuple>(get<TreeCoord::FirstElement::childIndex>(node.childs));
}
template<typename TreeCoord, typename Identifier, typename Type, typename CountType>
LLAMA_FN_HOST_ACC_INLINE auto changeNodeRuntime(
const Node<Identifier, Type, CountType>& tree,
std::size_t newValue)
{
if constexpr(std::is_same_v<TreeCoord, Tuple<>>)
return Node<Identifier, Type>{newValue, tree.childs};
else
{
auto current = get<TreeCoord::FirstElement::childIndex>(tree.childs);
auto replacement = changeNodeRuntime<typename TreeCoord::RestTuple>(current, newValue);
auto children = tupleReplace<TreeCoord::FirstElement::childIndex>(tree.childs, replacement);
return Node<Identifier, decltype(children)>{tree.count, children};
}
}
template<typename TreeCoord, typename Identifier, typename Type, typename CountType>
LLAMA_FN_HOST_ACC_INLINE auto changeNodeRuntime(
const Leaf<Identifier, Type, CountType>& /*tree*/,
std::size_t newValue)
{
return Leaf<Identifier, Type, std::size_t>{newValue};
}
struct ChangeNodeChildsRuntimeFunctor
{
const std::size_t newValue;
template<typename Identifier, typename Type, typename CountType>
LLAMA_FN_HOST_ACC_INLINE auto operator()(const Node<Identifier, Type, CountType>& element) const
{
return Node<Identifier, Type, std::size_t>{element.count * newValue, element.childs};
}
template<typename Identifier, typename Type, typename CountType>
LLAMA_FN_HOST_ACC_INLINE auto operator()(const Leaf<Identifier, Type, CountType>& element) const
{
return Leaf<Identifier, Type, std::size_t>{element.count * newValue};
}
};
template<typename TreeCoord, typename Identifier, typename Type, typename CountType>
LLAMA_FN_HOST_ACC_INLINE auto changeNodeChildsRuntime(
const Node<Identifier, Type, CountType>& tree,
std::size_t newValue)
{
if constexpr(std::is_same_v<TreeCoord, Tuple<>>)
{
auto children = tupleTransform(tree.childs, ChangeNodeChildsRuntimeFunctor{newValue});
return Node<Identifier, decltype(children)>{tree.count, children};
}
else
{
auto current = get<TreeCoord::FirstElement::childIndex>(tree.childs);
auto replacement = changeNodeChildsRuntime<typename TreeCoord::RestTuple>(current, newValue);
auto children = tupleReplace<TreeCoord::FirstElement::childIndex>(tree.childs, replacement);
return Node<Identifier, decltype(children)>{tree.count, children};
}
}
template<typename TreeCoord, typename Identifier, typename Type, typename CountType>
LLAMA_FN_HOST_ACC_INLINE auto changeNodeChildsRuntime(
const Leaf<Identifier, Type, CountType>& tree,
std::size_t /*newValue*/)
{
return tree;
}
} // namespace internal
    /// Functor for \ref tree::Mapping. Moves the run time part of a node one level down in the direction of the
    /// leaves by the given amount (a runtime or compile time value).
    /// \tparam TreeCoord Tree coordinate in the mapping tree whose run time part shall be moved down one level.
/// \see tree::Mapping
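    /// Example (a sketch; moves a runtime factor of 4 from the node at tree coordinate <0> one level down):
    /// \code
    ///     auto moveDown = llama::mapping::tree::functor::MoveRTDown<llama::mapping::tree::TreeCoord<0>>{4};
    /// \endcode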
template<typename TreeCoord, typename Amount = std::size_t>
struct MoveRTDown
{
const Amount amount = {};
template<typename Tree>
LLAMA_FN_HOST_ACC_INLINE auto basicToResult(const Tree& tree) const
{
return internal::changeNodeChildsRuntime<TreeCoord>(
internal::changeNodeRuntime<TreeCoord>(
tree,
// NOLINTNEXTLINE(clang-analyzer-core.DivideZero)
(internal::getNode<TreeCoord>(tree).count + amount - 1) / amount),
amount);
}
template<typename Tree, typename BasicCoord>
LLAMA_FN_HOST_ACC_INLINE auto basicCoordToResultCoord(const BasicCoord& basicCoord, const Tree& tree) const
{
return basicCoordToResultCoordImpl<TreeCoord>(basicCoord, tree);
}
template<typename Tree, typename ResultCoord>
LLAMA_FN_HOST_ACC_INLINE auto resultCoordToBasicCoord(const ResultCoord& resultCoord, const Tree&) const
-> ResultCoord
{
return resultCoord;
}
private:
template<typename InternalTreeCoord, typename BasicCoord, typename Tree>
LLAMA_FN_HOST_ACC_INLINE auto basicCoordToResultCoordImpl(const BasicCoord& basicCoord, const Tree& tree) const
{
if constexpr(std::is_same_v<InternalTreeCoord, Tuple<>>)
{
if constexpr(std::is_same_v<BasicCoord, Tuple<>>)
return Tuple{};
else
{
const auto& childTree = get<BasicCoord::FirstElement::childIndex>(tree.childs);
const auto rt1 = basicCoord.first.arrayIndex / amount;
const auto rt2
= basicCoord.first.arrayIndex % amount * childTree.count + basicCoord.rest.first.arrayIndex;
auto rt1Child = TreeCoordElement<BasicCoord::FirstElement::childIndex>{rt1};
auto rt2Child = TreeCoordElement<BasicCoord::RestTuple::FirstElement::childIndex>{rt2};
return tupleCat(Tuple{rt1Child}, tupleCat(Tuple{rt2Child}, pop_front(basicCoord.rest)));
}
}
else
{
if constexpr(InternalTreeCoord::FirstElement::childIndex != BasicCoord::FirstElement::childIndex)
return basicCoord;
else
{
auto rest = basicCoordToResultCoordImpl<typename InternalTreeCoord::RestTuple>(
pop_front(basicCoord),
get<BasicCoord::FirstElement::childIndex>(tree.childs));
return tupleCat(Tuple{basicCoord.first}, rest);
}
}
}
};
template<typename TreeCoord, std::size_t Amount>
using MoveRTDownFixed = MoveRTDown<TreeCoord, boost::mp11::mp_size_t<Amount>>;
} // namespace llama::mapping::tree::functor
// ==
// == ./mapping/tree/Functors.hpp ==
// ============================================================================
// #include "TreeFromDimensions.hpp" // amalgamate: file already expanded
// ============================================================================
// == ./mapping/tree/toString.hpp ==
// ==
// Copyright 2018 Alexander Matthes
// SPDX-License-Identifier: GPL-3.0-or-later
// #pragma once
// #include "TreeFromDimensions.hpp" // amalgamate: file already expanded
// #include <boost/core/demangle.hpp> // amalgamate: file already included
// #include <string> // amalgamate: file already included
#include <typeinfo>
namespace llama::mapping::tree
{
template<typename T>
auto toString(T) -> std::string
{
return "Unknown";
}
// handles array indices
template<std::size_t I>
inline auto toString(RecordCoord<I>) -> std::string
{
return "";
}
inline auto toString(NoName) -> std::string
{
return "";
}
template<typename... Elements>
auto toString(Tuple<Elements...> tree) -> std::string
{
if constexpr(sizeof...(Elements) > 1)
return toString(tree.first) + " , " + toString(tree.rest);
else
return toString(tree.first);
}
namespace internal
{
inline void replace_all(std::string& str, const std::string& search, const std::string& replace)
{
std::string::size_type i = 0;
while((i = str.find(search, i)) != std::string::npos)
{
str.replace(i, search.length(), replace);
i += replace.length();
}
}
template<typename NodeOrLeaf>
auto countAndIdentToString(const NodeOrLeaf& nodeOrLeaf) -> std::string
{
auto r = std::to_string(nodeOrLeaf.count);
if constexpr(std::is_same_v<std::decay_t<decltype(nodeOrLeaf.count)>, std::size_t>)
r += "R"; // runtime
else
r += "C"; // compile time
r += std::string{" * "} + toString(typename NodeOrLeaf::Identifier{});
return r;
}
} // namespace internal
template<typename Identifier, typename Type, typename CountType>
auto toString(const Node<Identifier, Type, CountType>& node) -> std::string
{
return internal::countAndIdentToString(node) + "[ " + toString(node.childs) + " ]";
}
template<typename Identifier, typename Type, typename CountType>
auto toString(const Leaf<Identifier, Type, CountType>& leaf) -> std::string
{
auto raw = boost::core::demangle(typeid(Type).name());
#ifdef _MSC_VER
internal::replace_all(raw, " __cdecl(void)", "");
#endif
#ifdef __GNUG__
internal::replace_all(raw, " ()", "");
#endif
return internal::countAndIdentToString(leaf) + "(" + raw + ")";
}
} // namespace llama::mapping::tree
// ==
// == ./mapping/tree/toString.hpp ==
// ============================================================================
// #include <type_traits> // amalgamate: file already included
namespace llama::mapping::tree
{
namespace internal
{
template<typename Tree, typename TreeOperationList>
struct MergeFunctors
{
};
template<typename Tree, typename... Operations>
struct MergeFunctors<Tree, Tuple<Operations...>>
{
boost::mp11::mp_first<Tuple<Operations...>> operation = {};
using ResultTree = decltype(operation.basicToResult(Tree()));
ResultTree treeAfterOp;
MergeFunctors<ResultTree, boost::mp11::mp_drop_c<Tuple<Operations...>, 1>> next = {};
MergeFunctors() = default;
LLAMA_FN_HOST_ACC_INLINE
MergeFunctors(const Tree& tree, const Tuple<Operations...>& treeOperationList)
: operation(treeOperationList.first)
, treeAfterOp(operation.basicToResult(tree))
, next(treeAfterOp, pop_front(treeOperationList))
{
}
LLAMA_FN_HOST_ACC_INLINE
auto basicToResult(const Tree& tree) const
{
if constexpr(sizeof...(Operations) > 1)
return next.basicToResult(treeAfterOp);
else if constexpr(sizeof...(Operations) == 1)
return operation.basicToResult(tree);
else
return tree;
}
template<typename TreeCoord>
LLAMA_FN_HOST_ACC_INLINE auto basicCoordToResultCoord(const TreeCoord& basicCoord, const Tree& tree) const
{
if constexpr(sizeof...(Operations) >= 1)
return next.basicCoordToResultCoord(
operation.basicCoordToResultCoord(basicCoord, tree),
treeAfterOp);
else
return basicCoord;
}
template<typename TreeCoord>
LLAMA_FN_HOST_ACC_INLINE auto resultCoordToBasicCoord(const TreeCoord& resultCoord, const Tree& tree) const
{
if constexpr(sizeof...(Operations) >= 1)
return next.resultCoordToBasicCoord(
operation.resultCoordToBasicCoord(resultCoord, tree),
operation.basicToResult(tree));
else
return resultCoord;
}
};
template<typename Tree>
struct MergeFunctors<Tree, Tuple<>>
{
MergeFunctors() = default;
LLAMA_FN_HOST_ACC_INLINE
MergeFunctors(const Tree&, const Tuple<>&)
{
}
LLAMA_FN_HOST_ACC_INLINE
auto basicToResult(const Tree& tree) const
{
return tree;
}
template<typename TreeCoord>
LLAMA_FN_HOST_ACC_INLINE auto basicCoordToResultCoord(TreeCoord const& basicCoord, Tree const& /*tree*/)
const -> TreeCoord
{
return basicCoord;
}
template<typename TreeCoord>
LLAMA_FN_HOST_ACC_INLINE auto resultCoordToBasicCoord(TreeCoord const& resultCoord, Tree const& /*tree*/)
const -> TreeCoord
{
return resultCoord;
}
};
template<typename Identifier, typename Type, typename CountType>
LLAMA_FN_HOST_ACC_INLINE auto getTreeBlobSize(const Node<Identifier, Type, CountType>& node) -> std::size_t;
template<typename Identifier, typename Type, typename CountType>
LLAMA_FN_HOST_ACC_INLINE auto getTreeBlobSize(const Leaf<Identifier, Type, CountType>& leaf) -> std::size_t;
template<typename... Children, std::size_t... Is, typename Count>
LLAMA_FN_HOST_ACC_INLINE auto getChildrenBlobSize(
const Tuple<Children...>& childs,
std::index_sequence<Is...> /*ii*/,
const Count& count) -> std::size_t
{
return count * (getTreeBlobSize(get<Is>(childs)) + ...);
}
template<typename Identifier, typename Type, typename CountType>
LLAMA_FN_HOST_ACC_INLINE auto getTreeBlobSize(const Node<Identifier, Type, CountType>& node) -> std::size_t
{
constexpr std::size_t childCount = boost::mp11::mp_size<std::decay_t<decltype(node.childs)>>::value;
return getChildrenBlobSize(node.childs, std::make_index_sequence<childCount>{}, LLAMA_COPY(node.count));
}
template<typename Identifier, typename Type, typename CountType>
LLAMA_FN_HOST_ACC_INLINE auto getTreeBlobSize(const Leaf<Identifier, Type, CountType>& leaf) -> std::size_t
{
return leaf.count * sizeof(Type);
}
template<typename Childs, typename CountType>
LLAMA_FN_HOST_ACC_INLINE auto getTreeBlobSize(const Childs& childs, const CountType& count) -> std::size_t
{
return getTreeBlobSize(Node<NoName, Childs, CountType>{count, childs});
}
template<std::size_t MaxPos, typename Identifier, typename Type, typename CountType, std::size_t... Is>
LLAMA_FN_HOST_ACC_INLINE auto sumChildrenSmallerThan(
const Node<Identifier, Type, CountType>& node,
std::index_sequence<Is...>) -> std::size_t
{
return ((getTreeBlobSize(get<Is>(node.childs)) * (Is < MaxPos)) + ...);
}
template<typename Tree, typename... Coords>
LLAMA_FN_HOST_ACC_INLINE auto getTreeBlobByte(const Tree& tree, const Tuple<Coords...>& treeCoord)
-> std::size_t
{
const auto firstArrayIndex = treeCoord.first.arrayIndex;
if constexpr(sizeof...(Coords) > 1)
{
constexpr auto firstChildIndex = decltype(treeCoord.first.childIndex)::value;
return getTreeBlobSize(tree.childs, firstArrayIndex)
+ sumChildrenSmallerThan<firstChildIndex>(
tree,
std::make_index_sequence<std::tuple_size_v<typename Tree::ChildrenTuple>>{})
+ getTreeBlobByte(get<firstChildIndex>(tree.childs), treeCoord.rest);
}
else
return sizeof(typename Tree::Type) * firstArrayIndex;
}
} // namespace internal
    /// An experimental attempt to provide a general purpose description of a mapping. Array and record
/// dimensions are represented by a compile time tree data structure. This tree is mapped into memory by means of a
/// breadth-first tree traversal. By specifying additional tree operations, the tree can be modified at compile
/// time before being mapped to memory.
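    /// Example (a sketch, assuming a record dimension `Particle` and an array extents value `extents` defined by the
    /// user):
    /// \code
    ///     auto mapping = llama::mapping::tree::Mapping{
    ///         extents, llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT{}}, Particle{}};
    ///     auto view = llama::allocView(mapping);
    /// \endcode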
template<typename TArrayExtents, typename TRecordDim, typename TreeOperationList>
struct Mapping : private TArrayExtents
{
using ArrayExtents = TArrayExtents;
using ArrayIndex = typename ArrayExtents::Index;
using RecordDim = TRecordDim;
using BasicTree = TreeFromDimensions<ArrayExtents, RecordDim>;
        // TODO(bgruber): support more than one blob
static constexpr std::size_t blobCount = 1;
using MergedFunctors = internal::MergeFunctors<BasicTree, TreeOperationList>;
BasicTree basicTree;
MergedFunctors mergedFunctors;
using ResultTree = decltype(mergedFunctors.basicToResult(basicTree));
ResultTree resultTree;
Mapping() = default;
LLAMA_FN_HOST_ACC_INLINE
Mapping(ArrayExtents extents, TreeOperationList treeOperationList, RecordDim = {})
: ArrayExtents(extents)
, basicTree(createTree<RecordDim>(extents.toArray()))
, mergedFunctors(basicTree, treeOperationList)
, resultTree(mergedFunctors.basicToResult(basicTree))
{
}
LLAMA_FN_HOST_ACC_INLINE auto extents() const -> ArrayExtents
{
return ArrayExtents{*this};
}
LLAMA_FN_HOST_ACC_INLINE
auto blobSize(std::size_t const) const -> std::size_t
{
return internal::getTreeBlobSize(resultTree);
}
template<std::size_t... RecordCoords>
LLAMA_FN_HOST_ACC_INLINE auto blobNrAndOffset(ArrayIndex ai, RecordCoord<RecordCoords...> = {}) const
-> NrAndOffset
{
auto const basicTreeCoord = createTreeCoord<RecordCoord<RecordCoords...>>(ai);
auto const resultTreeCoord = mergedFunctors.basicCoordToResultCoord(basicTreeCoord, basicTree);
const auto offset = internal::getTreeBlobByte(resultTree, resultTreeCoord);
return {0, offset};
}
};
} // namespace llama::mapping::tree
// ==
// == ./mapping/tree/Mapping.hpp ==
// ============================================================================
#ifdef __NVCC__
# pragma pop
#endif
// ==
// == ./llama.hpp ==
// ============================================================================