Jackmin801

## test_deepep.py
"""Minimal DeepEP buffer init test — no model loading needed.
Run on both nodes:
  Node 0: torchrun --nnodes=2 --node-rank=0 --nproc-per-node=1 --master-addr=172.16.2.222 --master-port=29500 test_deepep.py
  Node 1: torchrun --nnodes=2 --node-rank=1 --nproc-per-node=1 --master-addr=172.16.2.222 --master-port=29500 test_deepep.py
"""
import os
import time
import torch
import torch.distributed as dist

## a.md

      
1 file · 0 forks · 0 comments · 0 stars
Jackmin801 / a.md
Created February 25, 2026 05:50
H100 throughputs
              
          
    Benchmark: openai/GPT-OSS-120B

Server Configuration


| Parameter | Value |
| --- | --- |
| Model | openai/GPT-OSS-120B |
| TP | 8 |
| Max Model Len | 32,768 |
| KV Cache Tokens | 5,512,720 |


## a.md

      
1 file · 0 forks · 0 comments · 0 stars
Jackmin801 / a.md
Created February 25, 2026 05:49
H200 throughputs
              
          
    Benchmark: zai-org/GLM-4.7

Server Configuration


| Parameter | Value |
| --- | --- |
| Model | zai-org/GLM-4.7 |
| TP | 8 |
| Max Model Len | 65,536 |
| KV Cache Tokens | 1,700,000 |


## full_error_log_fused_lora_moe.py
❯ uv run vllm serve --model Qwen/Qwen3-30B-A3B --enable-lora --max-model-len 8192 --enforce-eager
WARNING 01-15 18:27:46 [argparse_utils.py:195] With `vllm serve`, you should provide the model as a positional argument or in a config file instead of via the `--model` option. The `--model` option will be removed in v0.13.
(APIServer pid=10987) INFO 01-15 18:27:46 [api_server.py:1351] vLLM API server version 0.13.0
(APIServer pid=10987) INFO 01-15 18:27:46 [utils.py:253] non-default args: {'model_tag': 'Qwen/Qwen3-30B-A3B', 'model': 'Qwen/Qwen3-30B-A3B', 'max_model_len': 8192, 'enforce_eager': True, 'enable_lora': True}
(APIServer pid=10987) INFO 01-15 18:27:47 [model.py:514] Resolved architecture: Qwen3MoeForCausalLM
(APIServer pid=10987) INFO 01-15 18:27:47 [model.py:1661] Using max model len 8192
(APIServer pid=10987) INFO 01-15 18:27:47 [scheduler.py:230] Chunked prefill is enabled with max_num_batched_tokens=8192.
(APIServer pid=10987) WARNING 01-15 18:27:47 [vllm.py:622] Enforce eager set, overriding optim

## meow.log
root@219a8dca190e:/vllm-workspace/prime-rl# uv run inference @ examples/reverse_text/rl/infer.toml --max-model-len 2048
warning: The `extra-build-dependencies` option is experimental and may change without warning. Pass `--preview-features extra-build-dependencies` to disable this warning.
/usr/local/lib/python3.12/dist-packages/pydantic/_internal/_generate_schema.py:2249: UnsupportedFieldAttributeWarning: The 'repr' attribute with value False was provided to the `Field()` function, which has no effect in the context it was used. 'repr' is field-specific metadata, and can only be attached to a model field using `Annotated` metadata or by assignment. This may have happened because an `Annotated` type alias using the `type` statement was used, or if the `Field()` function was attached to a single member of a union type.
  warnings.warn(
/usr/local/lib/python3.12/dist-packages/pydantic/_internal/_generate_schema.py:2249: UnsupportedFieldAttributeWarning: The 'frozen' attribute with value True was provided to the

## meow.py
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
import torch
from torch.distributed.fsdp import MixedPrecisionPolicy, fully_shard
from torch.distributed.checkpoint import HuggingFaceStorageReader
import torch.distributed.checkpoint as dcp
from transformers import AutoModelForCausalLM
from huggingface_hub import snapshot_download

def sd_pre_hook(module, prefix, keep_vars):

## meow.py
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
import torch
from torch.distributed.fsdp import MixedPrecisionPolicy, fully_shard
from torch.distributed.checkpoint import HuggingFaceStorageReader, DefaultLoadPlanner
import torch.distributed.checkpoint as dcp
from transformers import AutoModelForCausalLM
from huggingface_hub import snapshot_download
from prime_rl.utils.tensor_hashing import get_tensor_signature, get_module_signature

## load_hf_dcp.py
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
import torch
from torch.distributed.fsdp import MixedPrecisionPolicy, fully_shard
from torch.distributed.checkpoint import HuggingFaceStorageReader
import torch.distributed.checkpoint as dcp
from transformers import AutoModelForCausalLM

def main():
    dist.init_process_group("nccl")

## context_parallel_grad_err.py
import os
import time

import torch
import torch.distributed as dist
import torch.nn.functional as F
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.experimental import context_parallel
from torch.distributed.tensor.experimental._attention import context_parallel_unshard
from torch.nn.attention import sdpa_kernel, SDPBackend

## qwen30b-a3b-tt-exporting.py
# %%
import torch
from safetensors import safe_open
from safetensors.torch import save_file
from pathlib import Path

#save_file(tensors, "model.safetensors", metadata={"format": "pt"})
safetensor_paths = list(Path("/root/old_safetensors").glob("*.safetensors"))

state_dict = {}
	"""Minimal DeepEP buffer init test — no model loading needed.
	Run on both nodes:
	Node 0: torchrun --nnodes=2 --node-rank=0 --nproc-per-node=1 --master-addr=172.16.2.222 --master-port=29500 test_deepep.py
	Node 1: torchrun --nnodes=2 --node-rank=1 --nproc-per-node=1 --master-addr=172.16.2.222 --master-port=29500 test_deepep.py
	"""
	import os
	import time
	import torch
	import torch.distributed as dist
Parameter	Value
Model	openai/GPT-OSS-120B
TP	8
Max Model Len	32,768
KV Cache Tokens	5,512,720
Parameter	Value
Model	zai-org/GLM-4.7
TP	8
Max Model Len	65,536
KV Cache Tokens	1,700,000
	❯ uv run vllm serve --model Qwen/Qwen3-30B-A3B --enable-lora --max-model-len 8192 --enforce-eager
	WARNING 01-15 18:27:46 [argparse_utils.py:195] With `vllm serve`, you should provide the model as a positional argument or in a config file instead of via the `--model` option. The `--model` option will be removed in v0.13.
	(APIServer pid=10987) INFO 01-15 18:27:46 [api_server.py:1351] vLLM API server version 0.13.0
	(APIServer pid=10987) INFO 01-15 18:27:46 [utils.py:253] non-default args: {'model_tag': 'Qwen/Qwen3-30B-A3B', 'model': 'Qwen/Qwen3-30B-A3B', 'max_model_len': 8192, 'enforce_eager': True, 'enable_lora': True}
	(APIServer pid=10987) INFO 01-15 18:27:47 [model.py:514] Resolved architecture: Qwen3MoeForCausalLM
	(APIServer pid=10987) INFO 01-15 18:27:47 [model.py:1661] Using max model len 8192
	(APIServer pid=10987) INFO 01-15 18:27:47 [scheduler.py:230] Chunked prefill is enabled with max_num_batched_tokens=8192.
	(APIServer pid=10987) WARNING 01-15 18:27:47 [vllm.py:622] Enforce eager set, overriding optim
	root@219a8dca190e:/vllm-workspace/prime-rl# uv run inference @ examples/reverse_text/rl/infer.toml --max-model-len 2048
	warning: The `extra-build-dependencies` option is experimental and may change without warning. Pass `--preview-features extra-build-dependencies` to disable this warning.
	/usr/local/lib/python3.12/dist-packages/pydantic/_internal/_generate_schema.py:2249: UnsupportedFieldAttributeWarning: The 'repr' attribute with value False was provided to the `Field()` function, which has no effect in the context it was used. 'repr' is field-specific metadata, and can only be attached to a model field using `Annotated` metadata or by assignment. This may have happened because an `Annotated` type alias using the `type` statement was used, or if the `Field()` function was attached to a single member of a union type.
	warnings.warn(
	/usr/local/lib/python3.12/dist-packages/pydantic/_internal/_generate_schema.py:2249: UnsupportedFieldAttributeWarning: The 'frozen' attribute with value True was provided to the
	import torch.distributed as dist
	from torch.distributed.device_mesh import init_device_mesh
	import torch
	from torch.distributed.fsdp import MixedPrecisionPolicy, fully_shard
	from torch.distributed.checkpoint import HuggingFaceStorageReader
	import torch.distributed.checkpoint as dcp
	from transformers import AutoModelForCausalLM
	from huggingface_hub import snapshot_download

	def sd_pre_hook(module, prefix, keep_vars):
	# %%
	import torch
	from safetensors import safe_open
	from safetensors.torch import save_file
	from pathlib import Path

	#save_file(tensors, "model.safetensors", metadata={"format": "pt"})
	safetensor_paths = list(Path("/root/old_safetensors").glob("*.safetensors"))

	state_dict = {}