Last active
May 13, 2026 18:55
-
-
Save ChrisKitching/53d9b3503c11881c5ea395ec41aff419 to your computer and use it in GitHub Desktop.
Managed memory microbenchmark (HIP vs. SCALE)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #ifdef __HIP__ | |
| #include <hip/hip_runtime.h> | |
| #define cudaError_t hipError_t | |
| #define cudaSuccess hipSuccess | |
| #define cudaMalloc hipMalloc | |
| #define cudaMallocManaged hipMallocManaged | |
| #define cudaDeviceSynchronize hipDeviceSynchronize | |
| #endif | |
| #include <linux/mman.h> | |
| #include <sys/mman.h> | |
| #include <chrono> | |
| #include <cstdio> | |
| using T = float; | |
| //constexpr unsigned int blockSize = 1 << 10; | |
| //constexpr unsigned int numBlocks = 1 << 10; | |
| //constexpr size_t eltsPerThread = 1 << 6; | |
| constexpr unsigned int blockSize = 1 << 10; | |
| constexpr unsigned int numBlocks = 1 << 8; | |
| constexpr size_t eltsPerThread = 1 << 12; | |
| constexpr size_t bytes = blockSize * numBlocks * eltsPerThread * sizeof(T); | |
| __global__ void warmup(){} | |
| __global__ void k(T *dst, const T *src) | |
| { | |
| size_t tid = blockIdx.x * blockSize + threadIdx.x; | |
| T acc = 0; | |
| for (unsigned int i = 0; i < eltsPerThread; i++) { | |
| acc += src[tid * eltsPerThread + i]; | |
| } | |
| dst[tid] = acc; | |
| } | |
| void check(cudaError_t e) | |
| { | |
| if (e != cudaSuccess) { | |
| abort(); | |
| } | |
| } | |
| int main() | |
| { | |
| printf("Dimensions: %u, %u, %zu\n", blockSize, numBlocks, eltsPerThread); | |
| T *dst = nullptr; | |
| T *src = nullptr; | |
| warmup<<<1, 1>>>(); | |
| check(cudaDeviceSynchronize()); | |
| check(cudaMalloc(&dst, blockSize * numBlocks * sizeof(T))); | |
| check(cudaMallocManaged(&src, bytes)); | |
| { | |
| printf("Starting source data generation to %p (0x%llx bytes).\n", (void *)src, (unsigned long long)bytes); | |
| auto start = std::chrono::high_resolution_clock::now(); | |
| for (size_t i = 0; i < blockSize * numBlocks * eltsPerThread; i++) { | |
| src[i] = i; | |
| } | |
| auto end = std::chrono::high_resolution_clock::now(); | |
| printf("Source number generation took %0.03f ms.\n", std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() * 1e-6); | |
| } | |
| for (unsigned int i = 0; i < 20; i++) { | |
| auto start = std::chrono::high_resolution_clock::now(); | |
| k<<<numBlocks, blockSize>>>(dst, src); | |
| check(cudaDeviceSynchronize()); | |
| auto end = std::chrono::high_resolution_clock::now(); | |
| int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count(); | |
| printf("Iteration %u: %0.03f ms (%0.03f GiB/s).\n", i, ns * 1e-6, (double)bytes / ns); | |
| } | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
How to run
hipcc, run the HIP version withHSA_XNACK=1setnvccwithSCALE_AMD_XNACK=1set.Commentary
The benchmark allocates a big chunk of managed memory, and then runs gpu kernels against it over and over. This is simulates the iterative style of computation done by CFD workloads, but is by no means unique to that kind of workload. After each iteration, the time gets printed out.
Managed memory is a CUDA feature which allows you to allocate memory once, and leave it up to the driver/runtime to handle moving it between CPU and GPU. This makes programs simpler because you don't have to manually move the data around, but it relies on the platform to do that relocation for you efficiently.
This benchmark represents a unidirectional transfer: the memory starts on the host, and is only used on the GPU. Once all of the memory has been relocated to the GPU, the timings stop changing, since migrations have stopped happening. A workload that alternates host/device use of the memory would be even more strongly affected by the differences in page relocation speed between the platforms.
SCALE performs the page relocations much faster than HIP. This is visible in the dramatically improved timings of the early iterations. Even for a workload that only does a unidirectional transfer, the effect on oveall timing is significant. Copying 32GB of input data at 0.007GB/s is liable to take longer than your entire CFD simulation!
Sample results
MI355
HIP (rocm 7.1)
SCALE 1.7.0:
MI210
HIP (rocm 7.1)
SCALE 1.7.0: