Skip to content

Instantly share code, notes, and snippets.

View Jackmin801's full-sized avatar

Jackmin801

View GitHub Profile
@Jackmin801
Jackmin801 / test_deepep.py
Created February 26, 2026 07:21
DeepEP buffer init test
"""Minimal DeepEP buffer init test — no model loading needed.
Run on both nodes:
Node 0: torchrun --nnodes=2 --node-rank=0 --nproc-per-node=1 --master-addr=172.16.2.222 --master-port=29500 test_deepep.py
Node 1: torchrun --nnodes=2 --node-rank=1 --nproc-per-node=1 --master-addr=172.16.2.222 --master-port=29500 test_deepep.py
"""
import os
import time
import torch
import torch.distributed as dist
@Jackmin801
Jackmin801 / a.md
Created February 25, 2026 05:50
H100 throughputs

Benchmark: openai/GPT-OSS-120B

Server Configuration

Parameter Value
Model openai/GPT-OSS-120B
TP 8
Max Model Len 32,768
KV Cache Tokens 5,512,720
@Jackmin801
Jackmin801 / a.md
Created February 25, 2026 05:49
H200 throughputs

Benchmark: zai-org/GLM-4.7

Server Configuration

Parameter Value
Model zai-org/GLM-4.7
TP 8
Max Model Len 65,536
KV Cache Tokens 1,700,000
@Jackmin801
Jackmin801 / full_error_log_fused_lora_moe.py
Created January 15, 2026 18:42
Full error log for fused LoRA MoE in vLLM
❯ uv run vllm serve --model Qwen/Qwen3-30B-A3B --enable-lora --max-model-len 8192 --enforce-eager
WARNING 01-15 18:27:46 [argparse_utils.py:195] With `vllm serve`, you should provide the model as a positional argument or in a config file instead of via the `--model` option. The `--model` option will be removed in v0.13.
(APIServer pid=10987) INFO 01-15 18:27:46 [api_server.py:1351] vLLM API server version 0.13.0
(APIServer pid=10987) INFO 01-15 18:27:46 [utils.py:253] non-default args: {'model_tag': 'Qwen/Qwen3-30B-A3B', 'model': 'Qwen/Qwen3-30B-A3B', 'max_model_len': 8192, 'enforce_eager': True, 'enable_lora': True}
(APIServer pid=10987) INFO 01-15 18:27:47 [model.py:514] Resolved architecture: Qwen3MoeForCausalLM
(APIServer pid=10987) INFO 01-15 18:27:47 [model.py:1661] Using max model len 8192
(APIServer pid=10987) INFO 01-15 18:27:47 [scheduler.py:230] Chunked prefill is enabled with max_num_batched_tokens=8192.
(APIServer pid=10987) WARNING 01-15 18:27:47 [vllm.py:622] Enforce eager set, overriding optim
@Jackmin801
Jackmin801 / meow.log
Last active December 18, 2025 06:10
vLLM CUDA init bug
root@219a8dca190e:/vllm-workspace/prime-rl# uv run inference @ examples/reverse_text/rl/infer.toml --max-model-len 2048
warning: The `extra-build-dependencies` option is experimental and may change without warning. Pass `--preview-features extra-build-dependencies` to disable this warning.
/usr/local/lib/python3.12/dist-packages/pydantic/_internal/_generate_schema.py:2249: UnsupportedFieldAttributeWarning: The 'repr' attribute with value False was provided to the `Field()` function, which has no effect in the context it was used. 'repr' is field-specific metadata, and can only be attached to a model field using `Annotated` metadata or by assignment. This may have happened because an `Annotated` type alias using the `type` statement was used, or if the `Field()` function was attached to a single member of a union type.
warnings.warn(
/usr/local/lib/python3.12/dist-packages/pydantic/_internal/_generate_schema.py:2249: UnsupportedFieldAttributeWarning: The 'frozen' attribute with value True was provided to the
@Jackmin801
Jackmin801 / meow.py
Last active September 29, 2025 23:37
state dict hook testing
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
import torch
from torch.distributed.fsdp import MixedPrecisionPolicy, fully_shard
from torch.distributed.checkpoint import HuggingFaceStorageReader
import torch.distributed.checkpoint as dcp
from transformers import AutoModelForCausalLM
from huggingface_hub import snapshot_download
def sd_pre_hook(module, prefix, keep_vars):
@Jackmin801
Jackmin801 / meow.py
Created September 17, 2025 05:39
Load HF to DCP
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
import torch
from torch.distributed.fsdp import MixedPrecisionPolicy, fully_shard
from torch.distributed.checkpoint import HuggingFaceStorageReader, DefaultLoadPlanner
import torch.distributed.checkpoint as dcp
from transformers import AutoModelForCausalLM
from huggingface_hub import snapshot_download
from prime_rl.utils.tensor_hashing import get_tensor_signature, get_module_signature
@Jackmin801
Jackmin801 / load_hf_dcp.py
Created September 13, 2025 19:54
Load DCP checkpoint from safetensors
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
import torch
from torch.distributed.fsdp import MixedPrecisionPolicy, fully_shard
from torch.distributed.checkpoint import HuggingFaceStorageReader
import torch.distributed.checkpoint as dcp
from transformers import AutoModelForCausalLM
def main():
dist.init_process_group("nccl")
import os
import time
import torch
import torch.distributed as dist
import torch.nn.functional as F
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.experimental import context_parallel
from torch.distributed.tensor.experimental._attention import context_parallel_unshard
from torch.nn.attention import sdpa_kernel, SDPBackend
@Jackmin801
Jackmin801 / qwen30b-a3b-tt-exporting.py
Last active October 1, 2025 18:35
qwen30b-a3b tt exporting
# %%
import torch
from safetensors import safe_open
from safetensors.torch import save_file
from pathlib import Path
#save_file(tensors, "model.safetensors", metadata={"format": "pt"})
safetensor_paths = list(Path("/root/old_safetensors").glob("*.safetensors"))
state_dict = {}