-
-
Save dangra/f8123001fe0f2453a8cd638b89738465 to your computer and use it in GitHub Desktop.
Fly.io CUDA example from https://cuda-tutorial.readthedocs.io/en/latest/tutorials/tutorial02/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ fly apps create cuda-vector-add | |
New app created: cuda-vector-add | |
$ fly deploy --ha=false | |
==> Verifying app config | |
Validating /Users/daniel/Downloads/app/fly.toml | |
Platform: machines | |
✓ Configuration is valid | |
--> Verified app config | |
==> Building image | |
Remote builder fly-builder-silent-sea-670 ready | |
==> Building image with Docker | |
... | |
--> Pushing image done | |
image: registry.fly.io/cuda-vector-add:deployment-01HAMY8C7KJDMRGQ4NE5YYEC7C | |
image size: 384 MB | |
Watch your deployment at https://fly.io/apps/cuda-vector-add/monitoring | |
This deployment will: | |
* create 1 "app" machine | |
No machines in group app, launching a new machine | |
Machine d5683ded5ce58e [app] update finished: success | |
Finished launching new machines | |
$ fly ssh console | |
Connecting to fdaa:2:f664:a7b:134:1cc7:127b:2... complete | |
root@d5683ded5ce58e:/# vector_add_grid | |
PASSED |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Base stage: Ubuntu 22.04 with the NVIDIA CUDA apt repository enabled.
# Both the build stage and the final runtime image derive from this.
FROM ubuntu:22.04 AS base
RUN apt-get update -q && apt-get install -y --no-install-recommends ca-certificates wget && \
    wget -qO /cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
    dpkg -i /cuda-keyring.deb && \
    # Remove the installer .deb and the apt package lists to keep the layer small.
    rm /cuda-keyring.deb && \
    rm -rf /var/lib/apt/lists/*

# Build stage: install only the CUDA compiler and build the sample binary.
FROM base AS build
RUN apt-get update -q && \
    apt-get install -y --no-install-recommends cuda-nvcc-12-2 && \
    rm -rf /var/lib/apt/lists/*
COPY vector_add_grid.cu /app/
RUN /usr/local/cuda-12.2/bin/nvcc /app/vector_add_grid.cu -o /app/vector_add_grid

# Runtime stage: only the compiled binary is carried over; the CUDA driver
# libraries are provided by the host at run time.
FROM base
COPY --from=build /app/vector_add_grid /usr/local/bin/
# Keep the machine alive so it can be reached with `fly ssh console`.
CMD ["sleep", "inf"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fly.io app configuration for the CUDA vector-add demo.
app = "cuda-vector-add"
# Region to deploy into (iad = Ashburn, Virginia).
primary_region = "iad"
# Machine preset; "a100-80gb" selects a GPU machine size — presumably an
# NVIDIA A100 80GB instance (verify against Fly.io GPU machine docs).
vm.size = "a100-80gb"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <math.h> | |
#include <assert.h> | |
#include <cuda.h> | |
#include <cuda_runtime.h> | |
#define N 10000000 | |
#define MAX_ERR 1e-6 | |
// Element-wise vector addition: out[i] = a[i] + b[i] for every i in [0, n).
// Expects a 1-D grid/block launch with at least n total threads; the bounds
// check handles the partially-filled tail block, so n need not be a multiple
// of the block size. Inputs are marked const __restrict__ so the compiler may
// route the loads through the read-only data cache (a/b must not alias out).
__global__ void vector_add(float *__restrict__ out, const float *__restrict__ a,
                           const float *__restrict__ b, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) {
        out[tid] = a[tid] + b[tid];
    }
}
// Abort with file/line diagnostics when a CUDA runtime call fails. Kernel
// launches do not return an error directly, so the launch is checked
// separately with cudaGetLastError().
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Fills two N-element host vectors, adds them on the GPU with vector_add,
// verifies every element on the CPU, and prints "PASSED" on success.
// Returns 0 on success; aborts on any CUDA error or verification failure.
int main(){
    float *a, *b, *out;        // host buffers
    float *d_a, *d_b, *d_out;  // device buffers

    // Allocate host memory and fail fast if the allocation did not succeed.
    a = (float*)malloc(sizeof(float) * N);
    b = (float*)malloc(sizeof(float) * N);
    out = (float*)malloc(sizeof(float) * N);
    if (a == NULL || b == NULL || out == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n",
                sizeof(float) * N);
        return EXIT_FAILURE;
    }

    // Initialize host arrays; every output element should come out as 3.0f.
    for (int i = 0; i < N; i++) {
        a[i] = 1.0f;
        b[i] = 2.0f;
    }

    // Allocate device memory.
    CUDA_CHECK(cudaMalloc((void**)&d_a, sizeof(float) * N));
    CUDA_CHECK(cudaMalloc((void**)&d_b, sizeof(float) * N));
    CUDA_CHECK(cudaMalloc((void**)&d_out, sizeof(float) * N));

    // Copy the inputs host -> device.
    CUDA_CHECK(cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice));

    // Launch with ceil(N / block_size) blocks so every element gets a thread
    // (the previous (N + block_size) / block_size form launched one extra
    // empty block whenever N was an exact multiple of block_size).
    int block_size = 256;
    int grid_size = (N + block_size - 1) / block_size;
    vector_add<<<grid_size, block_size>>>(d_out, d_a, d_b, N);
    CUDA_CHECK(cudaGetLastError());  // catches an invalid launch configuration

    // Blocking copy device -> host; this also synchronizes with the kernel
    // and surfaces any asynchronous execution error.
    CUDA_CHECK(cudaMemcpy(out, d_out, sizeof(float) * N,
                          cudaMemcpyDeviceToHost));

    // Verify every element against the host-side expected value.
    for (int i = 0; i < N; i++) {
        assert(fabs(out[i] - a[i] - b[i]) < MAX_ERR);
    }
    printf("PASSED\n");

    // Release device memory, then host memory.
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_out));
    free(a);
    free(b);
    free(out);
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment