Skip to content

Instantly share code, notes, and snippets.

View frank-wei's full-sized avatar

Wei Wei frank-wei

  • Meta
  • Menlo Park, CA
View GitHub Profile
import tensorrt as trt
import torch
# https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#python_topics
"""
TensorRT Initialization
"""
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
@frank-wei
frank-wei / gist:0dd5c4d90f1bec3c3ce016230e5e66bc
Created September 9, 2025 15:06
reduce the weight loading time #24154
0
True
True
True
1
True
True
True
2
True
from openai import OpenAI
# Connect to a locally running vLLM server via its OpenAI-compatible API
# (vLLM's default listen port is 8000).
client = OpenAI(
base_url="http://localhost:8000/v1", # adjust the port if your server uses a different one
api_key="EMPTY" # vLLM does not require authentication; any placeholder key works
)
# Model identifier sent with each request — it must exactly match the model
# path/name passed to `vllm serve`, or the server will reject the request.
MODEL_NAME = "/home/wwei6/local/checkpoints/gpt-oss-120b" # or "openai/gpt-oss-120b"
@frank-wei
frank-wei / gist1.txt
Last active October 10, 2025 23:34
without a customized structural tag
P1977951167 - copy
FBID: 2225683307937911
(An Untitled Masterwork)
Visible to All Users
Author
wwei6
Created
Sat Oct 4, 2025 10:29pm
Forks
@frank-wei
frank-wei / gist2.txt
Created October 10, 2025 23:33
with structural tag
P1977390868 - copy
FBID: 751895307897003
(An Untitled Masterwork)
Visible to All Users
Author
wwei6
Created
Sat Oct 4, 2025 10:39am
Forks
{
"type": "structural_tag",
"format": {
"type":
"triggered_tags",
"stop_after_first":
False,
"tags": [{
"begin": "container.exec <|constrain|>json",
"content": {
Server:
vllm serve /data/local/model/Qwen2.5-3B-Instruct/ --port 8081
Client:
```
from openai import OpenAI
import json
client = OpenAI(
base_url="http://localhost:8081/v1",
api_key="-",