Skip to content

Instantly share code, notes, and snippets.

@jeromeku
jeromeku / test.py
Created August 27, 2025 01:25 — forked from Observer007/test.py
cute dsl inline_asm returns more than one value
import cutlass
import cutlass.cute as cute
from cutlass._mlir.dialects import llvm
from cutlass._mlir.extras import types as T
def compare_and_swap_i32(a: cutlass.Int32, b: cutlass.Int32) -> tuple[cutlass.Int32, cutlass.Int32]:
out_i32x2 = llvm.inline_asm(
llvm.StructType.get_literal([T.i32(), T.i32()]),
[cutlass.Int32(a).ir_value(), cutlass.Int32(b).ir_value()],
"{\n\t"
import time
from vllm import LLM, SamplingParams
from vllm.inputs import PromptType
from vllm.outputs import PoolingRequestOutput, RequestOutput
from typing import Union, cast, Sequence
from multiprocessing import Queue, Event
import threading
class MyLLM(LLM):
def keep_running(
@jeromeku
jeromeku / softmax_quack.py
Created July 11, 2025 12:10 — forked from Chillee/softmax_quack.py
Random Kernel Microbenchmarks
import argparse
import time
from typing import Type
import torch
import torch.nn.functional as F
import torch._inductor.config
torch._inductor.config.triton.multi_kernel = True
@jeromeku
jeromeku / pipeline_parallel.py
Created July 9, 2025 11:35 — forked from 3outeille/pipeline_parallel.py
Self contained example of how pipeline parallel works (AFAB and 1F1B) in 200 LOC
#VERBOSE=0 torchrun --nproc_per_node 3 self_contained_pp_LOC.py
import os, random, numpy as np, torch, torch.nn as nn, torch.distributed as dist, torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader, DistributedSampler
from datasets import load_dataset
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
STEP, local_rank, world_size, verbose = 0, int(os.environ["LOCAL_RANK"]), int(os.environ["WORLD_SIZE"]), os.environ.get("VERBOSE", "0") == "1"
def set_all_seed(seed):
import torch
from torch import nn
from torch.distributed.tensor.placement_types import Replicate, Shard
from torch.testing._internal.distributed.fake_pg import FakeStore
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DTensor, Replicate
world_size = 4
import time
import torch
import torch._inductor.config as config
from torch import Tensor
from torch._dynamo.device_interface import get_interface_for_device
from torch._inductor.runtime.static_cuda_launcher import StaticallyLaunchedCudaKernel
from torch._inductor.runtime.triton_compat import tl, triton
# Constants
@jeromeku
jeromeku / audit.c
Created June 30, 2025 21:52 — forked from youkaichao/audit.c
enable verbose cudagraph dump for pytorch
#define _GNU_SOURCE
#include <stdio.h>
#include <link.h>
#include <stdbool.h>
#include <string.h>
#include <stdlib.h>
typedef int cudaError_t;
typedef void* cudaGraph_t;
@jeromeku
jeromeku / 1_results.txt
Created June 18, 2025 14:35 — forked from corbt/1_results.txt
Benchmark script for reward model performance
Strategy | Relative Throughput | Time (s) | Cost ($/M tokens)
----------------------------------------------------------------------------------------
Unsloth | 2.17 | 3.83 | $0.0188
Unsloth+PEFT | 1.58 | 5.27 | $0.0259
Transformers+Liger | 1.14 | 7.28 | $0.0358
vLLM | 1.00 | 8.31 | $0.0409
Transformers | 0.97 | 8.54 | $0.0420
Transformers+Liger+PEFT | 0.84 | 9.85 | $0.0484
Transformers+PEFT | 0.74 | 11.26 | $0.0554
@jeromeku
jeromeku / private_fork.md
Created June 6, 2025 20:42 — forked from 0xjac/private_fork.md
Create a private fork of a public repository

The repository for the assignment is public and GitHub does not allow the creation of private forks for public repositories.

The correct way of creating a private fork by duplicating the repo is documented here.

For this assignment the commands are:

  1. Create a bare clone of the repository. (This is temporary and will be removed so just do it wherever.)

git clone --bare git@github.com:usi-systems/easytrace.git

import time
import unicodedata
import regex as re
regex_pattern = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
# --- Helper Functions for Character Properties ---