Skip to content

Instantly share code, notes, and snippets.

@andr01d-1
andr01d-1 / reduce_perf_event_recorder.cu
Created December 25, 2025 21:08
Using the event recorder API to benchmark a naive (sum) reduction kernel
#include <iostream>
#include <vector>
#include <cuda_runtime.h>
// reduce0 kernel: Sums elements using interleaved shared memory addressing
__global__ void reduce0(int *g_idata, int *g_odata) {
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
#include <Eigen/Core>
#include <Eigen/Geometry>
#include <iostream>
/*Eigen::Quaterniond multiply(Eigen::Quaterniond a, Eigen::Quaterniond b) {
a.normalize();
b.normalize();
Eigen::Quaterniond c = Eigen::Quaterniond::Identity();
c.w() = a.w() * b.w() - a.vec().transpose() * b.vec();