-
-
Save sonots/41daaa6432b1c8b27ef782cd14064269 to your computer and use it in GitHub Desktop.
#include <sys/time.h> | |
#include <cuda_runtime.h> | |
#include <stdio.h> | |
// Allocates two device buffers of `size` bytes each with cudaMalloc and prints
// the signed byte distance between the two returned device addresses
// (address of the second allocation minus address of the first).
//
// The output label "Alignment" is kept for compatibility with the original
// program; the printed value is the spacing between the two allocations.
//
// Parameters:
//   size - number of bytes requested for each allocation; must be >= 0.
//
// On any CUDA error a message is written to stderr, whatever was allocated is
// released, and the function returns without printing a distance.
void test(int size)
{
    if (size < 0) {
        // A negative int would silently convert to a huge size_t.
        fprintf(stderr, "test: invalid allocation size %d\n", size);
        return;
    }

    float *d1 = NULL;
    float *d2 = NULL;

    cudaError_t err = cudaMalloc(&d1, (size_t)size);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc(d1) failed: %s\n", cudaGetErrorString(err));
        return;
    }

    err = cudaMalloc(&d2, (size_t)size);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc(d2) failed: %s\n", cudaGetErrorString(err));
        cudaFree(d1);  // do not leak the first buffer
        return;
    }

    // Compare the addresses as integers: subtracting pointers that belong to
    // two different allocations is undefined behaviour, and scaling a float*
    // difference by sizeof(float) produced an unsigned value that did not
    // match the "%ld" conversion.
    const unsigned long long addr1 = (unsigned long long)d1;
    const unsigned long long addr2 = (unsigned long long)d2;
    if (addr2 >= addr1) {
        printf("Alignment: %llu\n", addr2 - addr1);
    } else {
        printf("Alignment: -%llu\n", addr1 - addr2);
    }

    err = cudaFree(d1);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaFree(d1) failed: %s\n", cudaGetErrorString(err));
    }
    err = cudaFree(d2);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaFree(d2) failed: %s\n", cudaGetErrorString(err));
    }
}
// Entry point: queries and prints the name of device 0, selects that device,
// then runs test(1) to print the distance between two 1-byte allocations.
//
// Returns 0 on success and 1 if the device query or device selection fails.
int main(int argc, char **argv)
{
    (void)argc;  // command-line arguments are not used
    (void)argv;

    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    cudaError_t err = cudaGetDeviceProperties(&deviceProp, dev);
    if (err != cudaSuccess) {
        // deviceProp is not valid here, so its name must not be printed.
        fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n", dev,
                cudaGetErrorString(err));
        return 1;
    }
    printf("Using Device %d: %s\n", dev, deviceProp.name);

    err = cudaSetDevice(dev);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice(%d) failed: %s\n", dev,
                cudaGetErrorString(err));
        return 1;
    }

    test(1);
    return 0;
}
According to this document, https://www.classes.cs.uchicago.edu/archive/2011/winter/32102-1/reading/CUDA_C_Best_Practices_Guide.pdf :
Memory allocated through the CUDA Runtime API, such as via cudaMalloc(), is guaranteed to be aligned to at least 256 bytes.
However, allocations actually appear to be aligned to at least 512 bytes. It seems the document is outdated.
ref. https://stackoverflow.com/questions/36534599/cuda-malloc-minimum-and-typical-actual-alignment
256 bytes on Fermi, 512 bytes on Kepler and Maxwell
So, the alignment was 256 bytes on older GPUs such as Fermi, but it is 512 bytes nowadays.
I was also looking into this property, and my theory for the 512-byte alignment is that the device pointer returned by cudaMalloc
could be used to back texture memory, which requires the data to be aligned to textureAlignment
(= 512 bytes on V100). See, e.g., the cudaResourceDesc
section in the CUDA Runtime API documentation.
AWS p2.xlarge (k80)