Skip to content

Instantly share code, notes, and snippets.

@samnordmann
samnordmann / collective_communication.cpp
Created December 10, 2025 10:42
benchmarks/cpp/collective_communication.cpp
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
@samnordmann
samnordmann / CMakeLists.txt
Created November 24, 2025 16:01
Pytorch's CUDASymmetricMemory Standalone Example
# Require CMake 3.18 or newer to configure this project.
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
# Declare the project and enable both the C++ and CUDA languages.
project(CUDASymmetricMemoryExample LANGUAGES CXX CUDA)
# Set the C++ and CUDA language standards to 17. The *_STANDARD_REQUIRED
# settings turn the requested standard into a hard requirement.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
# Find required packages
namespace {
// Helper function to find the position of the first stream-parallelized axis
// in a domain. Returns -1 if no stream-parallelized axis is found.
// This function throws if multiple stream-parallelized axes are found (only
// one is allowed).
int64_t getStreamAxisPos(const std::vector<IterDomain*>& domain) {
int64_t ret = -1;
for (int64_t i = 0; i < (int64_t)domain.size(); i++) {
if (domain[i]->getParallelType() == ParallelType::Stream) {
NVF_CHECK(