Created
June 16, 2024 18:41
-
-
Save gboncoffee/6e838206d22f1fa2b878018f98ad8a44 to your computer and use it in GitHub Desktop.
Merge Sort running on Nvidia GPU via CUDA
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
void coutArray(float array[], int n) { | |
for (auto i = 0; i < n - 1; ++i) std::cout << array[i] << ", "; | |
std::cout << array[n - 1] << std::endl; | |
} | |
__global__ void gpuCopyArray(float dest[], float src[], int n) { | |
int index = blockIdx.x * blockDim.x + threadIdx.x; | |
int stride = blockDim.x * gridDim.x; | |
for (auto i = index; i < n; i += stride) dest[i] = src[i]; | |
} | |
__global__ void gpuMerge(float srcA[], float srcB[], float dest[], int n) { | |
auto idxA = 0; | |
auto idxB = 0; | |
auto i = 0; | |
while (idxA < n && idxB < n) { | |
if (srcA[idxA] < srcB[idxB]) { | |
dest[i] = srcA[idxA]; | |
++idxA; | |
} else { | |
dest[i] = srcB[idxB]; | |
++idxB; | |
} | |
++i; | |
} | |
for (; idxB < n; ++idxB) { | |
dest[i] = srcB[idxB]; | |
++i; | |
} | |
for (; idxA < n; ++idxA) { | |
dest[i] = srcA[idxA]; | |
++i; | |
} | |
} | |
__global__ void gpuMergeSort(float arr[], float aux[], int n) { | |
if (n / 2 == 1) return; | |
int mid = n / 2; | |
if (threadIdx.x == 1) { | |
arr = &arr[mid]; | |
aux = &aux[mid]; | |
n = n - mid; | |
} else { | |
n = mid; | |
} | |
mid = n / 2; | |
gpuMergeSort<<<1, 2>>>(arr, aux, n); | |
gpuMerge<<<1, 1>>>(arr, &arr[mid], aux, mid); | |
gpuCopyArray<<<1, n>>>(arr, aux, n); | |
} | |
void mergeSort(float arr[], float aux[], int n) { | |
int mid = n / 2; | |
gpuMergeSort<<<1, 2>>>(arr, aux, n); | |
gpuMerge<<<1, 1>>>(arr, &arr[mid], aux, mid); | |
gpuCopyArray<<<1, n>>>(arr, aux, n); | |
} | |
int main() { | |
const int N = 16; | |
float *arr; | |
float *aux; | |
cudaMallocManaged(&arr, N * sizeof(float)); | |
cudaMallocManaged(&aux, N * sizeof(float)); | |
for (auto i = 0; i < N; ++i) { | |
arr[i] = N - i; | |
} | |
coutArray(arr, N); | |
mergeSort(arr, aux, N); | |
cudaDeviceSynchronize(); | |
coutArray(aux, N); | |
cudaFree(arr); | |
cudaFree(aux); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment