jeromeku

## test.py
import cutlass
import cutlass.cute as cute
from cutlass._mlir.dialects import llvm
from cutlass._mlir.extras import types as T

def compare_and_swap_i32(a: cutlass.Int32, b: cutlass.Int32) -> tuple[cutlass.Int32, cutlass.Int32]:
    out_i32x2 = llvm.inline_asm(
        llvm.StructType.get_literal([T.i32(), T.i32()]),
        [cutlass.Int32(a).ir_value(), cutlass.Int32(b).ir_value()],
        "{\n\t"

## vllm_forloop.py
import time
from vllm import LLM, SamplingParams
from vllm.inputs import PromptType
from vllm.outputs import PoolingRequestOutput, RequestOutput
from typing import Union, cast, Sequence
from multiprocessing import Queue, Event
import threading

class MyLLM(LLM):
    def keep_running(

## softmax_quack.py
import argparse
import time
from typing import Type

import torch
import torch.nn.functional as F
import torch._inductor.config

torch._inductor.config.triton.multi_kernel = True

## pipeline_parallel.py
#VERBOSE=0 torchrun --nproc_per_node 3 self_contained_pp_LOC.py
import os, random, numpy as np, torch, torch.nn as nn, torch.distributed as dist, torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader, DistributedSampler
from datasets import load_dataset
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

STEP, local_rank, world_size, verbose = 0, int(os.environ["LOCAL_RANK"]), int(os.environ["WORLD_SIZE"]), os.environ.get("VERBOSE", "0") == "1"

def set_all_seed(seed):

## fakepg.py
import torch
from torch import nn
from torch.distributed.tensor.placement_types import Replicate, Shard
from torch.testing._internal.distributed.fake_pg import FakeStore

import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DTensor, Replicate

world_size = 4

## gist:7186a3d7a4d220a1dfebcc50f0c413a0
import time

import torch
import torch._inductor.config as config
from torch import Tensor
from torch._dynamo.device_interface import get_interface_for_device
from torch._inductor.runtime.static_cuda_launcher import StaticallyLaunchedCudaKernel
from torch._inductor.runtime.triton_compat import tl, triton

# Constants

## audit.c
#define _GNU_SOURCE
#include <stdio.h>
#include <link.h>
#include <stdbool.h>
#include <string.h>
#include <stdlib.h>

typedef int cudaError_t;
typedef void* cudaGraph_t;

## 1_results.txt
Strategy                       | Relative Throughput  | Time (s)     | Cost ($/M tokens)
----------------------------------------------------------------------------------------
Unsloth                        | 2.17                 | 3.83         | $0.0188
Unsloth+PEFT                   | 1.58                 | 5.27         | $0.0259
Transformers+Liger             | 1.14                 | 7.28         | $0.0358
vLLM                           | 1.00                 | 8.31         | $0.0409
Transformers                   | 0.97                 | 8.54         | $0.0420
Transformers+Liger+PEFT        | 0.84                 | 9.85         | $0.0484
Transformers+PEFT              | 0.74                 | 11.26        | $0.0554

## private_fork.md

      
              1 file
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                jeromeku
                / private_fork.md
            
            
              Created
              June 6, 2025 20:42
                — forked from 0xjac/private_fork.md
            
              
                Create a private fork of a public repository
              
          
    The repository for the assignment is public, and GitHub does not allow the creation of private forks for public repositories.
The correct way of creating a private fork, by duplicating the repo, is documented in GitHub's "Duplicating a repository" guide.
For this assignment the commands are:

Create a bare clone of the repository.
(This is temporary and will be removed so just do it wherever.)


git clone --bare git@github.com:usi-systems/easytrace.git

  
## qwen_pretok.py
import time
import unicodedata

import regex as re

regex_pattern = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"

# --- Helper Functions for Character Properties ---
	import cutlass
	import cutlass.cute as cute
	from cutlass._mlir.dialects import llvm
	from cutlass._mlir.extras import types as T

	def compare_and_swap_i32(a: cutlass.Int32, b: cutlass.Int32) -> tuple[cutlass.Int32, cutlass.Int32]:
	out_i32x2 = llvm.inline_asm(
	llvm.StructType.get_literal([T.i32(), T.i32()]),
	[cutlass.Int32(a).ir_value(), cutlass.Int32(b).ir_value()],
	"{\n\t"
	import time
	from vllm import LLM, SamplingParams
	from vllm.inputs import PromptType
	from vllm.outputs import PoolingRequestOutput, RequestOutput
	from typing import Union, cast, Sequence
	from multiprocessing import Queue, Event
	import threading

	class MyLLM(LLM):
	def keep_running(
	import argparse
	import time
	from typing import Type

	import torch
	import torch.nn.functional as F
	import torch._inductor.config

	torch._inductor.config.triton.multi_kernel = True
	#VERBOSE=0 torchrun --nproc_per_node 3 self_contained_pp_LOC.py
	import os, random, numpy as np, torch, torch.nn as nn, torch.distributed as dist, torch.nn.functional as F
	from torch.optim import AdamW
	from torch.utils.data import DataLoader, DistributedSampler
	from datasets import load_dataset
	from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

	STEP, local_rank, world_size, verbose = 0, int(os.environ["LOCAL_RANK"]), int(os.environ["WORLD_SIZE"]), os.environ.get("VERBOSE", "0") == "1"

	def set_all_seed(seed):
	import torch
	from torch import nn
	from torch.distributed.tensor.placement_types import Replicate, Shard
	from torch.testing._internal.distributed.fake_pg import FakeStore

	import torch.distributed as dist
	from torch.distributed.device_mesh import init_device_mesh
	from torch.distributed.tensor import DTensor, Replicate

	world_size = 4
	import time

	import torch
	import torch._inductor.config as config
	from torch import Tensor
	from torch._dynamo.device_interface import get_interface_for_device
	from torch._inductor.runtime.static_cuda_launcher import StaticallyLaunchedCudaKernel
	from torch._inductor.runtime.triton_compat import tl, triton

	# Constants
	#define _GNU_SOURCE
	#include <stdio.h>
	#include <link.h>
	#include <stdbool.h>
	#include <string.h>
	#include <stdlib.h>

	typedef int cudaError_t;
	typedef void* cudaGraph_t;
	Strategy \| Relative Throughput \| Time (s) \| Cost ($/M tokens)
	----------------------------------------------------------------------------------------
	Unsloth \| 2.17 \| 3.83 \| $0.0188
	Unsloth+PEFT \| 1.58 \| 5.27 \| $0.0259
	Transformers+Liger \| 1.14 \| 7.28 \| $0.0358
	vLLM \| 1.00 \| 8.31 \| $0.0409
	Transformers \| 0.97 \| 8.54 \| $0.0420
	Transformers+Liger+PEFT \| 0.84 \| 9.85 \| $0.0484
	Transformers+PEFT \| 0.74 \| 11.26 \| $0.0554
	import time
	import unicodedata

	import regex as re

	regex_pattern = r"(?i:'s\|'t\|'re\|'ve\|'m\|'ll\|'d)\|[^\r\n\p{L}\p{N}]?\p{L}+\|\p{N}\| ?[^\s\p{L}\p{N}]+[\r\n]*\|\s*[\r\n]+\|\s+(?!\S)\|\s+"

	# --- Helper Functions for Character Properties ---