This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import torchao | |
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig | |
# benchmark the performance | |
import torch.utils.benchmark as benchmark | |
def benchmark_fn(f, *args, **kwargs): | |
# Manual warmup | |
for _ in range(5): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import torchao | |
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig | |
# benchmark the performance | |
import torch.utils.benchmark as benchmark | |
def benchmark_fn(f, *args, **kwargs): | |
# Manual warmup | |
for _ in range(5): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"GEMV": {}, "GEMV_REVSPLITK": {"(1, 1536, 4096, 64, 8)": {"BLOCK_SIZE_M": 1, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 16, "A_load_order": 0, "meta_evict_policy": "", "atomic_mode": "relaxed", "dot_prod_mode": 0, "num_warps": 2, "num_ctas": 1, "num_stages": 1}, "(1, 4096, 1024, 64, 8)": {"BLOCK_SIZE_M": 1, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 8, "A_load_order": 1, "meta_evict_policy": "", "atomic_mode": "relaxed", "dot_prod_mode": 0, "num_warps": 2, "num_ctas": 1, "num_stages": 1}, "(1, 7168, 4096, 64, 8)": {"BLOCK_SIZE_M": 1, "BLOCK_SIZE_N": 512, "BLOCK_SIZE_K": 8, "A_load_order": 0, "meta_evict_policy": "", "atomic_mode": "relaxed", "dot_prod_mode": 0, "num_warps": 4, "num_ctas": 1, "num_stages": 1}, "(1, 4096, 3584, 64, 8)": {"BLOCK_SIZE_M": 1, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 16, "A_load_order": 0, "meta_evict_policy": "", "atomic_mode": "relaxed", "dot_prod_mode": 0, "num_warps": 2, "num_ctas": 1, "num_stages": 1}, "(1, 6144, 4096, 64, 8)": {"BLOCK_SIZE_M": 1, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 8, "A_l |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
model.safetensors.index.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23.9k/23.9k [00:00<00:00, 125MB/s] | |
model-00001-of-00004.safetensors: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.98G/4.98G [01:58<00:00, 42.0MB/s] | |
model-00002-of-00004.safetensors: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5.00G/5.00G [01:58<00:00, 42.2MB/s] | |
model-00003-of-00004.safetensors: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.92G/4.92G [01:56<00:00, 42.0MB/s] | |
model-00004-of-00004.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00, 5.77s/it, est. speed input: 1.39 toks/s, output: 177.49 toks/s] | |
1 | |
Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00, 5.57s/it, est. speed input: 2.16 toks/s, output: 183.93 toks/s] | |
4 | |
Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00, 1.44s/it, est. speed input: 8.35 toks/s, output: 518.36 toks/s] | |
8 | |
Processed prompts: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:06<00:00, 1.33it/s, est. speed input: 18.29 toks/s, output: 1052.92 toks/s] | |
16 | |
Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:06<00:00, 2.33it |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"GEMV": {}, "GEMV_REVSPLITK": {"(1, 6144, 4096, 64, 8)": {"BLOCK_SIZE_M": 1, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 8, "A_load_order": 0, "meta_evict_policy": "", "atomic_mode": "relaxed", "dot_prod_mode": 0, "num_warps": 2, "num_ctas": 1, "num_stages": 1}, "(1, 4096, 4096, 64, 8)": {"BLOCK_SIZE_M": 1, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 16, "A_load_order": 1, "meta_evict_policy": "", "atomic_mode": "relaxed", "dot_prod_mode": 0, "num_warps": 2, "num_ctas": 1, "num_stages": 2}, "(1, 28672, 4096, 64, 8)": {"BLOCK_SIZE_M": 1, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 8, "A_load_order": 1, "meta_evict_policy": "", "atomic_mode": "relaxed", "dot_prod_mode": 0, "num_warps": 2, "num_ctas": 1, "num_stages": 2}, "(1, 4096, 14336, 64, 8)": {"BLOCK_SIZE_M": 1, "BLOCK_SIZE_N": 512, "BLOCK_SIZE_K": 8, "A_load_order": 0, "meta_evict_policy": "", "atomic_mode": "relaxed", "dot_prod_mode": 0, "num_warps": 4, "num_ctas": 1, "num_stages": 2}, "(1, 6144, 4096, 64, 2)": {"BLOCK_SIZE_M": 1, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "A |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[2024-12-19 10:59:34 TP5] Scheduler hit an exception: Traceback (most recent call last): | |
File "/data/users/jerryzh/sglang/python/sglang/srt/managers/scheduler.py", line 1528, in run_scheduler_process | |
scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, dp_rank) | |
File "/data/users/jerryzh/sglang/python/sglang/srt/managers/scheduler.py", line 192, in __init__ | |
self.tp_worker = TpWorkerClass( | |
File "/data/users/jerryzh/sglang/python/sglang/srt/managers/tp_worker_overlap_thread.py", line 62, in __init__ | |
self.worker = TpModelWorker(server_args, gpu_id, tp_rank, dp_rank, nccl_port) | |
File "/data/users/jerryzh/sglang/python/sglang/srt/managers/tp_worker.py", line 62, in __init__ | |
self.model_runner = ModelRunner( | |
File "/data/users/jerryzh/sglang/python/sglang/srt/model_executor/model_runner.py", line 158, in __init__ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[2024-12-17 15:09:47 TP0] Decode batch. #running-req: 968, #token: 511530, token usage: 0.11, gen throughput (token/s): 1554.84, #queue-req: 0 | |
[2024-12-17 15:09:48 TP0] Decode batch. #running-req: 922, #token: 523719, token usage: 0.11, gen throughput (token/s): 47852.88, #queue-req: 0 | |
[2024-12-17 15:09:49 TP0] Decode batch. #running-req: 883, #token: 535168, token usage: 0.11, gen throughput (token/s): 46588.76, #queue-req: 0 | |
[2024-12-17 15:09:50 TP0] Decode batch. #running-req: 847, #token: 548080, token usage: 0.11, gen throughput (token/s): 44284.98, #queue-req: 0 | |
[2024-12-17 15:09:50 TP0] Decode batch. #running-req: 799, #token: 545397, token usage: 0.11, gen throughput (token/s): 42336.33, #queue-req: 0 | |
[2024-12-17 15:09:51 TP0] Decode batch. #running-req: 767, #token: 556549, token usage: 0.12, gen throughput (token/s): 41241.09, #queue-req: 0 | |
[2024-12-17 15:09:52 TP0] Decode batch. #running-req: 730, #token: 558371, token usage: 0.12, gen throughput (token/s): 39677.03, #queue-req: 0 | |
[2024-12-17 15:09 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
W1202 13:59:56.994000 2658530 site-packages/torch/_logging/_internal.py:1084] [1/0] Profiler function <class 'torch.autograd.profiler.record_function'> will be ignored 13:59:58 [117/1719] | |
Traceback (most recent call last): | |
File "/data/users/jerryzh/ao/examples/sam2_amg_server/server.py", line 709, in <module> | |
fire.Fire(main) | |
File "/home/jerryzh/.conda/envs/ao/lib/python3.10/site-packages/fire/core.py", line 135, in Fire | |
component_trace = _Fire(component, args, parsed_flag_args, context, name) | |
File "/home/jerryzh/.conda/envs/ao/lib/python3.10/site-packages/fire/core.py", line 468, in _Fire | |
component, remaining_args = _CallAndUpdateTrace( | |
File "/home/jerryzh/.conda/envs/ao/lib/python3.10/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace | |
component = fn(*varargs, **kwargs) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/python/pyproject.toml b/python/pyproject.toml | |
index d9749e1..fbcc0fd 100644 | |
--- a/python/pyproject.toml | |
+++ b/python/pyproject.toml | |
@@ -20,7 +20,7 @@ runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hu | |
"orjson", "packaging", "pillow", "psutil", "pydantic", "python-multipart", | |
"torchao", "uvicorn", "uvloop", "zmq", | |
"outlines>=0.0.44", "modelscope"] | |
-srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"] | |
+srt = ["sglang[runtime_common]", "torch", "vllm"] |
NewerOlder