Skip to content

Instantly share code, notes, and snippets.

Last active February 13, 2024 23:19
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save dangra/f8123001fe0f2453a8cd638b89738465 to your computer and use it in GitHub Desktop.
Save dangra/f8123001fe0f2453a8cd638b89738465 to your computer and use it in GitHub Desktop.
$ fly apps create cuda-vector-add
New app created: cuda-vector-add
$ fly deploy --ha=false
==> Verifying app config
Validating /Users/daniel/Downloads/app/fly.toml
Platform: machines
✓ Configuration is valid
--> Verified app config
==> Building image
Remote builder fly-builder-silent-sea-670 ready
==> Building image with Docker
--> Pushing image done
image size: 384 MB
Watch your deployment at https://fly.io/apps/cuda-vector-add/monitoring
This deployment will:
* create 1 "app" machine
No machines in group app, launching a new machine
Machine d5683ded5ce58e [app] update finished: success
Finished launching new machines
$ fly ssh console
Connecting to fdaa:2:f664:a7b:134:1cc7:127b:2... complete
root@d5683ded5ce58e:/# vector_add_grid
# Base stage: Ubuntu 22.04 plus NVIDIA's CUDA apt repository (via cuda-keyring).
FROM ubuntu:22.04 as base
# NOTE(review): the keyring URL was lost in extraction; this is NVIDIA's
# standard Ubuntu 22.04 x86_64 location — confirm against the original gist.
RUN apt update -q && apt install -y ca-certificates wget && \
    wget -qO /cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
    dpkg -i /cuda-keyring.deb && apt update -q

# Build stage: install only nvcc (not the full toolkit) and compile the sample.
FROM base as build
RUN apt install -y --no-install-recommends cuda-nvcc-12-2
# NOTE(review): source filename lost in extraction; inferred from the output
# binary name "vector_add_grid" — confirm.
COPY vector_add_grid.cu /app/
RUN /usr/local/cuda-12.2/bin/nvcc /app/vector_add_grid.cu -o /app/vector_add_grid

# Runtime stage: ship only the compiled binary; the CUDA driver is provided by
# the host GPU machine at runtime.
FROM base
COPY --from=build /app/vector_add_grid /usr/local/bin/
# Keep the machine alive so we can `fly ssh console` in and run the binary.
CMD ["sleep", "inf"]
# fly.toml — Fly.io app configuration for the CUDA vector-add demo.
app = "cuda-vector-add"
# Region to deploy into; must be one that offers GPU machines.
primary_region = "iad"
# Machine preset with an NVIDIA A100 80GB GPU attached.
vm.size = "a100-80gb"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define N 10000000
#define MAX_ERR 1e-6
// Element-wise vector addition: out[i] = a[i] + b[i] for i in [0, n).
// Expects a 1-D launch; each thread handles exactly one element.
// (Closing braces restored — they were lost when this gist was extracted.)
__global__ void vector_add(float *out, float *a, float *b, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard the grid tail: the grid rarely divides n evenly, so threads
    // past the end must do nothing.
    if (tid < n) {
        out[tid] = a[tid] + b[tid];
    }
}
// Host driver: allocate two N-element vectors, add them on the GPU, verify
// the result on the CPU, and release all device and host memory.
// Returns 0 on success, 1 on allocation or launch failure.
// (Closing braces, cleanup calls, and the return were lost in extraction.)
int main(){
    float *a, *b, *out;
    float *d_a, *d_b, *d_out;

    // Allocate host memory
    a = (float*)malloc(sizeof(float) * N);
    b = (float*)malloc(sizeof(float) * N);
    out = (float*)malloc(sizeof(float) * N);
    if (a == NULL || b == NULL || out == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // Initialize host arrays
    for (int i = 0; i < N; i++) {
        a[i] = 1.0f;
        b[i] = 2.0f;
    }

    // Allocate device memory
    cudaMalloc((void**)&d_a, sizeof(float) * N);
    cudaMalloc((void**)&d_b, sizeof(float) * N);
    cudaMalloc((void**)&d_out, sizeof(float) * N);

    // Transfer data from host to device memory
    cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);

    // Executing kernel. Ceil-division: (N + bs - 1) / bs launches exactly
    // enough blocks; the original (N + bs) / bs launched one extra block
    // whenever N was a multiple of block_size (harmless here only because
    // the kernel bounds-checks, but wasteful).
    int block_size = 256;
    int grid_size = (N + block_size - 1) / block_size;
    vector_add<<<grid_size, block_size>>>(d_out, d_a, d_b, N);
    // Kernel launches are asynchronous and report config errors only via
    // cudaGetLastError(); check explicitly instead of failing silently.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Transfer data back to host memory. A blocking cudaMemcpy on the
    // default stream waits for the kernel to finish, so no explicit
    // cudaDeviceSynchronize() is needed before reading `out`.
    cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);

    // Verification against the host-side inputs.
    for (int i = 0; i < N; i++) {
        assert(fabs(out[i] - a[i] - b[i]) < MAX_ERR);
    }
    printf("PASSED\n");

    // Deallocate device memory (comments existed in the original but the
    // calls were missing — this leaked all three device buffers).
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);

    // Deallocate host memory
    free(a);
    free(b);
    free(out);

    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment