youkaichao youkaichao

## wait.cu
#include <cstdio>
#include <iostream>
#include <cuda_runtime.h>

__global__ void waitKernel(volatile bool *flag) {
    // Busy-wait loop
    while (!*flag) {
        // The use of volatile ensures that the GPU fetches the flag value from memory each time
        // This is necessary because, without volatile, the compiler might optimize away the repeated memory read
        __threadfence_system(); // Optional for system-wide memory coherence

## test.py
from vllm import LLM, SamplingParams
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

from contextlib import nullcontext

## audit.c
#define _GNU_SOURCE
#include <stdio.h>
#include <link.h>
#include <stdbool.h>
#include <string.h>
#include <stdlib.h>

typedef int cudaError_t;
typedef void* cudaGraph_t;

## test.py
# run the code with `torchrun --nproc-per-node 4 test.py`
import os
os.environ['NCCL_DEBUG'] = 'TRACE'
import torch
import torch.distributed as dist

# nccl communicators are lazily created
dist.init_process_group(backend='nccl')
print("init done")

## test.py
import torch
import torch.distributed as dist
import os
import multiprocessing
import multiprocessing.shared_memory
import io
import pickle

N_warmup = 10 # warmup N_warmup times
N = 100 # repeat N times

## gist:7ed49dcb55b2e66dfd841b1a9b0bfeff
import torch
from torch import nn
import copy

class BackboneModel(nn.Module):
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.conv1 = nn.Conv2d(16, 16, 6)
        self.bn1 = nn.BatchNorm2d(16)
    def forward(self, x):

## dynamic_conv_bn.py
import torch
from torch import nn
import copy

from torch.fx.experimental.efficient_conv_bn_eval import turn_on_efficient_conv_bn_eval


class BackboneModel(nn.Module):
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

## syntax.s
# --------
# Hardware
# --------

# Opcode - operational code
# Assembly mnemonic - abbreviation for an operation

# Instruction Code Format (IA-32)
# - Optional instruction prefix
# - Operational code
	#include <cstdio>
	#include <iostream>
	#include <cuda_runtime.h>

	__global__ void waitKernel(volatile bool *flag) {
	// Busy-wait loop
	while (!*flag) {
	// The use of volatile ensures that the GPU fetches the flag value from memory each time
	// This is necessary because, without volatile, the compiler might optimize away the repeated memory read
	__threadfence_system(); // Optional for system-wide memory coherence
	from vllm import LLM, SamplingParams
	prompts = [
	"Hello, my name is",
	"The president of the United States is",
	"The capital of France is",
	"The future of AI is",
	]
	sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

	from contextlib import nullcontext
	#define _GNU_SOURCE
	#include <stdio.h>
	#include <link.h>
	#include <stdbool.h>
	#include <string.h>
	#include <stdlib.h>

	typedef int cudaError_t;
	typedef void* cudaGraph_t;
	# run the code with `torchrun --nproc-per-node 4 test.py`
	import os
	os.environ['NCCL_DEBUG'] = 'TRACE'
	import torch
	import torch.distributed as dist

	# nccl communicators are lazily created
	dist.init_process_group(backend='nccl')
	print("init done")
	import torch
	from torch import nn
	import copy

	class BackboneModel(nn.Module):
	def __init__(self, args, *kwargs) -> None:
	super().__init__(args, *kwargs)
	self.conv1 = nn.Conv2d(16, 16, 6)
	self.bn1 = nn.BatchNorm2d(16)
	def forward(self, x):
	# --------
	# Hardware
	# --------

	# Opcode - operational code
	# Assembly mnemonic - abbreviation for an operation

	# Instruction Code Format (IA-32)
	# - Optional instruction prefix
	# - Operational code