Skip to content

Instantly share code, notes, and snippets.

View Ediolot's full-sized avatar
Status: 🛰️ :o
Jorge Sierra (Ediolot)
View GitHub Profile
import numpy as np
class Vector:
    """Container for 3-D grid/volume dimensions.

    Stores width, height, depth, an optional count ``n``, and the
    precomputed integer half-width (``w // 2``).
    """

    def __init__(self, w, h, d, n=None):
        """Record the dimensions and precompute the half width.

        Parameters
        ----------
        w : int
            Width; its floor half is cached as ``half_w``.
        h : int
            Height.
        d : int
            Depth.
        n : optional
            Extra count/identifier; defaults to ``None``.
        """
        # Optional field first, then the spatial dimensions.
        self.n = n
        self.d = d
        self.h = h
        self.w = w
        # Floor division keeps half_w an int for odd widths.
        self.half_w = self.w // 2
@Ediolot
Ediolot / vector_add.cu
Created January 27, 2023 11:49
CUDA Vector add example
#include <iostream>
// CUDA kernel: element-wise addition of two float vectors, C = A + B.
// NOTE(review): the gist preview is truncated — the loop that writes C
// and the function's closing brace are not visible in this capture.
__global__ void vectorAdd(const float *A, const float *B, float *C, uint32_t N) {
// Threads per block along x (launch configuration).
const uint32_t threads_per_block = blockDim.x;
// Number of blocks in the grid along x.
const uint32_t total_blocks = gridDim.x;
// This thread's block index and index within the block.
const uint32_t block_id = blockIdx.x;
const uint32_t thread_id = threadIdx.x;
// Total threads launched — presumably used as a stride over N elements
// in the truncated part; confirm against the full gist.
const uint32_t total_threads = total_blocks * threads_per_block;
// Globally unique linear index of this thread.
const uint32_t idx = block_id * threads_per_block + thread_id;
@Ediolot
Ediolot / gpu_cpu_performance_test.cu
Created January 27, 2023 12:15
GPU and CPU performance test
#include <iostream>
#include <chrono>
// CUDA kernel: element-wise addition of two float vectors, C = A + B
// (same kernel as the companion vector_add.cu gist).
// NOTE(review): truncated preview — the per-thread index, the loop that
// writes C, and the closing brace are not visible here.
__global__ void vectorAdd(const float *A, const float *B, float *C, uint32_t N) {
// Threads per block along x (launch configuration).
const uint32_t threads_per_block = blockDim.x;
// Number of blocks in the grid along x.
const uint32_t total_blocks = gridDim.x;
// This thread's block index and index within the block.
const uint32_t block_id = blockIdx.x;
const uint32_t thread_id = threadIdx.x;
// Total threads launched across the whole grid.
const uint32_t total_threads = total_blocks * threads_per_block;
@Ediolot
Ediolot / vector_add_python.cu
Created January 27, 2023 12:50
Vector addition in CUDA and Torch
#include <torch/extension.h>
#include <cstdint>
// CUDA kernel: element-wise vector addition c = a + b using a
// grid-stride loop, so any grid size covers all N elements.
// NOTE(review): the gist preview cuts off after the loop — the
// function's closing brace is not visible in this capture.
__global__ void kernel_vector_add(uint32_t N, float* a, float* b, float* c) {
// Globally unique starting index for this thread.
uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
// Stride = total threads in the grid; each thread steps by this amount.
uint32_t max_threads = gridDim.x * blockDim.x;
// Grid-stride loop: handles N larger than the number of launched threads.
for (uint32_t i = idx; i < N; i += max_threads) {
c[i] = a[i] + b[i];
}