-
-
Save rinfz/d39e23699f4fdacae9642a43dce7ee5e to your computer and use it in GitHub Desktop.
GDAL + CUDA - (a.cu: basic addition kernel) (b.cu: trig distance from an origin point)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cassert> | |
#include <cstdio> | |
#include <vector> | |
#include <gdal/gdal_priv.h> | |
// Adds a constant offset of 1000 to every input sample.
//
// Expects a 1D launch with at least num_elements threads; one thread per
// element, with a bounds guard for the tail block. `__restrict__` tells the
// compiler the buffers do not alias, enabling the read-only data cache for
// `in`.
__global__ void
vector_add(const int16_t* __restrict__ in, int16_t* __restrict__ out, int num_elements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < num_elements) {
        // NOTE(review): in[i] + 1000 is computed as int and truncated back to
        // int16_t on store; inputs above 31767 would wrap. Typical terrain
        // elevations stay far below that, but this is unverified here.
        out[i] = in[i] + 1000;
    }
}
// Reads one Int16 band of an SRTM .hgt tile with GDAL, adds 1000 to every
// sample on the GPU, and verifies the result element-by-element on the host.
// Exits with EXIT_FAILURE on any GDAL or CUDA error.
int
main()
{
    GDALAllRegister();

    GDALDataset *dataset = (GDALDataset*) GDALOpen("data/N33W099.hgt", GA_ReadOnly);
    if (dataset == nullptr) {
        // GDALOpen returns null on failure; dereferencing it below would crash.
        fprintf(stderr, "Failed to open data/N33W099.hgt with GDAL\n");
        exit(EXIT_FAILURE);
    }
    GDALRasterBand *band = dataset->GetRasterBand(1);
    int x_size = band->GetXSize();
    int y_size = band->GetYSize();
    int num_elements = x_size * y_size;
    // Byte count for the device buffers; size_t so large rasters cannot
    // overflow an int.
    size_t size = (size_t)num_elements * sizeof(int16_t);
    std::vector<int16_t> buf(num_elements);
    std::vector<int16_t> out(num_elements);
    auto gdal_err = band->RasterIO(GF_Read, 0, 0, x_size, y_size, buf.data(), x_size, y_size, GDT_Int16, 0, 0);
    if (gdal_err != CE_None) {
        fprintf(stderr, "Failed to read dataset with GDAL\n");
        exit(EXIT_FAILURE);
    }
    GDALClose((GDALDatasetH) dataset);
    // END GDAL CODE

    cudaError_t err = cudaSuccess;

    // Allocate device input buffer.
    int16_t *device_buf = nullptr;
    err = cudaMalloc((void **)&device_buf, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device buffer (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate device output buffer.
    int16_t *device_out = nullptr;
    err = cudaMalloc((void **)&device_out, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device output buffer (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy input buf from host to device.
    err = cudaMemcpy(device_buf, buf.data(), size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy buf from host to device (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Ceil-divide so the tail of the raster is covered; the kernel guards
    // against the resulting overshoot.
    int threads_per_block = 256;
    int blocks_per_grid = (num_elements + threads_per_block - 1) / threads_per_block;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocks_per_grid, threads_per_block);
    vector_add<<<blocks_per_grid, threads_per_block>>>(device_buf, device_out, num_elements);
    // Kernel launches return no status directly; launch-config errors surface
    // via cudaGetLastError().
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch vector_add kernel (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the device result back to the host. This blocking copy also
    // synchronizes with the kernel, so execution errors surface here too.
    err = cudaMemcpy(out.data(), device_out, size, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy result from device to host (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Host-side verification against the same integer arithmetic the kernel
    // performs (exact, since both sides add a constant).
    for (int i = 0; i < num_elements; i++) {
        assert(buf[i] + 1000 == out[i]);
    }
    printf("Tests passed!\n");

    cudaFree(device_buf);
    cudaFree(device_out);
    printf("Done\n");
    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cassert> | |
#include <chrono> | |
#include <cmath> | |
#include <cstdio> | |
#include <vector> | |
#include <gdal/gdal_priv.h> | |
// Computes, per raster cell, the planar Euclidean distance from cell (x, y)
// to the origin cell (ox, oy), scaled by `res` (the ground size of one cell,
// e.g. 30 for ~30 m SRTM data). The elevation input `in` is intentionally
// unused — only cell coordinates matter for distance.
//
// Expects a 2D launch whose grid covers the raster.
// NOTE(review): the flat index uses the *launch* row width
// (gridDim.x * blockDim.x). This matches the raster's row-major layout only
// when that width equals x_size; with the padded grid the caller builds, rows
// land at shifted offsets. Fixing this needs x_size as a kernel parameter —
// confirm intended layout with the caller.
__global__ void
vector_add(const int16_t* in, float* out, int num_elements, int ox, int oy, int res)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    int i = y * (gridDim.x * blockDim.x) + x;
    if (i < num_elements) {
        // Single-precision throughout: std::pow/std::sqrt/std::abs promoted
        // everything to double, which is needless (and slow on consumer GPUs)
        // for a float result. |dx| is unnecessary before squaring.
        float dx = (float)(x - ox);
        float dy = (float)(y - oy);
        out[i] = (float)res * sqrtf(dx * dx + dy * dy);
    }
}
// Reads one Int16 band of an SRTM .hgt tile with GDAL, then computes on the
// GPU each cell's planar distance (in metres, 30 m cells) from the raster
// centre, timing the kernel. Exits with EXIT_FAILURE on any GDAL or CUDA
// error.
int
main()
{
    GDALAllRegister();

    GDALDataset *dataset = (GDALDataset*) GDALOpen("data/N33W099.hgt", GA_ReadOnly);
    if (dataset == nullptr) {
        // GDALOpen returns null on failure; dereferencing it below would crash.
        fprintf(stderr, "Failed to open data/N33W099.hgt with GDAL\n");
        exit(EXIT_FAILURE);
    }
    GDALRasterBand *band = dataset->GetRasterBand(1);
    int x_size = band->GetXSize();
    int y_size = band->GetYSize();
    int num_elements = x_size * y_size;
    // Input is int16_t samples; output is float distances — track the two
    // byte counts separately.
    size_t in_bytes = (size_t)num_elements * sizeof(int16_t);
    size_t out_bytes = (size_t)num_elements * sizeof(float);
    std::vector<int16_t> buf(num_elements);
    std::vector<float> out(num_elements);
    auto gdal_err = band->RasterIO(GF_Read, 0, 0, x_size, y_size, buf.data(), x_size, y_size, GDT_Int16, 0, 0);
    if (gdal_err != CE_None) {
        fprintf(stderr, "Failed to read dataset with GDAL\n");
        exit(EXIT_FAILURE);
    }
    GDALClose((GDALDatasetH) dataset);
    // END GDAL CODE

    printf("Processing %d elements\n", num_elements);
    cudaError_t err = cudaSuccess;

    // Allocate device input buffer.
    int16_t *device_buf = nullptr;
    err = cudaMalloc((void **)&device_buf, in_bytes);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device buffer (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate device output buffer (float-sized, not int16_t-sized).
    float *device_out = nullptr;
    err = cudaMalloc((void **)&device_out, out_bytes);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device output buffer (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy input buf from host to device.
    err = cudaMemcpy(device_buf, buf.data(), in_bytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy buf from host to device (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // 2D launch; ceil-divide so the raster edges are covered.
    dim3 threads_per_block(16, 16);
    dim3 blocks_per_grid(1 + x_size / threads_per_block.x, 1 + y_size / threads_per_block.y);

    auto start = std::chrono::high_resolution_clock::now();
    vector_add<<<blocks_per_grid, threads_per_block>>>(device_buf, device_out, num_elements, x_size / 2, y_size / 2, 30);
    // Kernel launches are asynchronous: check launch status, then block until
    // the kernel finishes so the timer measures execution, not just launch
    // overhead.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch vector_add kernel (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "vector_add kernel execution failed (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    auto dur = std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::high_resolution_clock::now() - start);
    // .count() extracts the double — passing the duration object itself
    // through printf's varargs is undefined behavior.
    printf("%fus\n", dur.count() * 1e6);

    // Copy the device result back to the host (full float buffer; copying
    // in_bytes here would truncate the results to half).
    err = cudaMemcpy(out.data(), device_out, out_bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy result from device to host (error code: %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    cudaFree(device_buf);
    cudaFree(device_out);
    printf("Done\n");
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment