|
[ |
|
{ |
|
"hw": "b200", |
|
"conc": 32, |
|
"image": "lmsysorg/sglang:v0.5.5-cu129-amd64", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 4, |
|
"ep": 4, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 641.2799367245719, |
|
"output_tput_per_gpu": 321.14790576674005, |
|
"input_tput_per_gpu": 320.1320309578319, |
|
"mean_ttft": 7.429033456041362, |
|
"median_ttft": 0.4049194911494851, |
|
"std_ttft": 21.507662512924416, |
|
"p99_ttft": 73.2787645055307, |
|
"mean_tpot": 0.016500912105395376, |
|
"mean_intvty": 60.602710541863054, |
|
"median_tpot": 0.016331182684193535, |
|
"median_intvty": 61.23255243282963, |
|
"std_tpot": 0.004724498328625295, |
|
"std_intvty": 211.6626846793644, |
|
"p99_tpot": 0.01710460220207336, |
|
"p99_intvty": 58.46379753156629, |
|
"mean_itl": 0.16403007508055295, |
|
"median_itl": 0.14787723892368376, |
|
"std_itl": 0.4242816317900714, |
|
"p99_itl": 0.25230101314838976, |
|
"mean_e2el": 22.643847975885727, |
|
"median_e2el": 15.617675733519718, |
|
"std_e2el": 21.843125195522077, |
|
"p99_e2el": 90.13985173074529 |
|
}, |
|
{ |
|
"hw": "b200-trt", |
|
"conc": 16, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 8, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 204.9070644391866, |
|
"output_tput_per_gpu": 101.91109312229496, |
|
"input_tput_per_gpu": 102.99597131689164, |
|
"mean_ttft": 0.32274896054441343, |
|
"median_ttft": 0.162723705987446, |
|
"std_ttft": 0.5147236327123939, |
|
"p99_ttft": 2.621198953207931, |
|
"mean_tpot": 0.01887051264378247, |
|
"mean_intvty": 52.992730980707286, |
|
"median_tpot": 0.018958024481654227, |
|
"median_intvty": 52.74811207083865, |
|
"std_tpot": 0.000609820012909863, |
|
"std_intvty": 1639.8281112952081, |
|
"p99_tpot": 0.020053270736644223, |
|
"p99_intvty": 49.86717693750856, |
|
"mean_itl": 0.18765400995375867, |
|
"median_itl": 0.17171890201279894, |
|
"std_itl": 0.05690608556211947, |
|
"p99_itl": 0.3957180514396168, |
|
"mean_e2el": 17.59278183688748, |
|
"median_e2el": 17.577111709993915, |
|
"std_e2el": 1.4441395601241613, |
|
"p99_e2el": 21.31427635858592 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 4300, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "true", |
|
"prefill_num_workers": 2, |
|
"decode_tp": 16, |
|
"decode_ep": 16, |
|
"decode_dp_attention": "true", |
|
"decode_num_workers": 1, |
|
"num_prefill_gpu": 8, |
|
"num_decode_gpu": 16, |
|
"tput_per_gpu": 8772.066417069726, |
|
"output_tput_per_gpu": 6576.648712718563, |
|
"input_tput_per_gpu": 13162.901825772049, |
|
"mean_ttft": 4.417850822288784, |
|
"median_ttft": 2.6825136054540053, |
|
"std_ttft": 5.863712313142769, |
|
"p99_ttft": 29.946521612133367, |
|
"mean_tpot": 0.033614227161874864, |
|
"mean_intvty": 29.749308088635647, |
|
"median_tpot": 0.03357622255888582, |
|
"median_intvty": 29.78298104398744, |
|
"std_tpot": 0.0015633774147045258, |
|
"std_intvty": 639.6408126370416, |
|
"p99_tpot": 0.037157022405837896, |
|
"p99_intvty": 26.912813117201928, |
|
"mean_itl": 0.6646102327497304, |
|
"median_itl": 0.5519979861564934, |
|
"std_itl": 0.3518335787505597, |
|
"p99_itl": 1.7907296301797035, |
|
"mean_e2el": 35.35489678359931, |
|
"median_e2el": 34.07474009040743, |
|
"std_e2el": 6.129267594599042, |
|
"p99_e2el": 60.59256729653575 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 2048, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "true", |
|
"prefill_num_workers": 2, |
|
"decode_tp": 16, |
|
"decode_ep": 16, |
|
"decode_dp_attention": "true", |
|
"decode_num_workers": 1, |
|
"num_prefill_gpu": 8, |
|
"num_decode_gpu": 16, |
|
"tput_per_gpu": 6197.878317247915, |
|
"output_tput_per_gpu": 4649.727217836739, |
|
"input_tput_per_gpu": 9294.180516070266, |
|
"mean_ttft": 1.5304020429895218, |
|
"median_ttft": 0.5354841455118731, |
|
"std_ttft": 3.0229475730882416, |
|
"p99_ttft": 14.324054803305772, |
|
"mean_tpot": 0.024446319285584682, |
|
"mean_intvty": 40.90595350236108, |
|
"median_tpot": 0.0244035563480784, |
|
"median_intvty": 40.977633986480114, |
|
"std_tpot": 0.00038243990509568583, |
|
"std_intvty": 2614.789896859225, |
|
"p99_tpot": 0.02536731993753382, |
|
"p99_intvty": 39.420798194782364, |
|
"mean_itl": 0.48334837704451544, |
|
"median_itl": 0.48330555483698845, |
|
"std_itl": 0.14492835540308474, |
|
"p99_itl": 0.8767040200764313, |
|
"mean_e2el": 24.053155971355096, |
|
"median_e2el": 23.445035883109085, |
|
"std_e2el": 3.2839838888592268, |
|
"p99_e2el": 36.94438854399603 |
|
}, |
|
{ |
|
"hw": "h200", |
|
"conc": 64, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 8, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 456.71277457392324, |
|
"output_tput_per_gpu": 228.30491853874665, |
|
"input_tput_per_gpu": 228.4078560351766, |
|
"mean_ttft": 0.9947605113498866, |
|
"median_ttft": 0.13977374229580164, |
|
"std_ttft": 2.71762373044683, |
|
"p99_ttft": 12.339342405293138, |
|
"mean_tpot": 0.033114762817580495, |
|
"mean_intvty": 30.198011850747847, |
|
"median_tpot": 0.03312615881115265, |
|
"median_intvty": 30.18762319231918, |
|
"std_tpot": 0.0016327572044284732, |
|
"std_intvty": 612.4609325181559, |
|
"p99_tpot": 0.039150759374931905, |
|
"p99_intvty": 25.542288731193715, |
|
"mean_itl": 0.32942854207200317, |
|
"median_itl": 0.30126926489174366, |
|
"std_itl": 0.11948468602180282, |
|
"p99_itl": 0.5422460364550352, |
|
"mean_e2el": 31.50653484758659, |
|
"median_e2el": 30.844364249147475, |
|
"std_e2el": 4.088792376153664, |
|
"p99_e2el": 45.05541437757201 |
|
}, |
|
{ |
|
"hw": "mi355x", |
|
"conc": 32, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", |
|
"model": "amd/DeepSeek-R1-0528-MXFP4-Preview", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 4, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 499.07528254880015, |
|
"output_tput_per_gpu": 249.30997391797308, |
|
"input_tput_per_gpu": 249.76530863082706, |
|
"mean_ttft": 0.23429270649787212, |
|
"median_ttft": 0.13443481949798297, |
|
"std_ttft": 0.4981070310754062, |
|
"p99_ttft": 3.693676319500664, |
|
"mean_tpot": 0.0316819662921726, |
|
"mean_intvty": 31.563697492067018, |
|
"median_tpot": 0.031497857283734415, |
|
"median_intvty": 31.748191344952307, |
|
"std_tpot": 0.0013645471132767524, |
|
"std_intvty": 732.8438793136662, |
|
"p99_tpot": 0.03544520550343103, |
|
"p99_intvty": 28.212560367387397, |
|
"mean_itl": 0.0316887455722762, |
|
"median_itl": 0.02808956999797374, |
|
"std_itl": 0.020226387532098964, |
|
"p99_itl": 0.09138488282158501, |
|
"mean_e2el": 29.373262552226716, |
|
"median_e2el": 29.33710997349408, |
|
"std_e2el": 2.402231403875462, |
|
"p99_e2el": 35.906585603464045 |
|
}, |
|
{ |
|
"hw": "mi355x", |
|
"conc": 16, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", |
|
"model": "amd/DeepSeek-R1-0528-MXFP4-Preview", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 4, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 306.0451682972859, |
|
"output_tput_per_gpu": 152.7598980530619, |
|
"input_tput_per_gpu": 153.285270244224, |
|
"mean_ttft": 0.1987650581939306, |
|
"median_ttft": 0.12773075599397998, |
|
"std_ttft": 0.26848966213290126, |
|
"p99_ttft": 1.9722618721981418, |
|
"mean_tpot": 0.025809103184290905, |
|
"mean_intvty": 38.74601890888889, |
|
"median_tpot": 0.02573446974101467, |
|
"median_intvty": 38.858387604786586, |
|
"std_tpot": 0.0008283913234227633, |
|
"std_intvty": 1207.1589497922075, |
|
"p99_tpot": 0.027493696731644646, |
|
"p99_intvty": 36.37197317481944, |
|
"mean_itl": 0.02581056827574398, |
|
"median_itl": 0.024163800000678748, |
|
"std_itl": 0.015743161240447973, |
|
"p99_itl": 0.08529472369060387, |
|
"mean_e2el": 23.935066793745317, |
|
"median_e2el": 24.009372471511597, |
|
"std_e2el": 1.7058797348005414, |
|
"p99_e2el": 27.448370655650503 |
|
}, |
|
{ |
|
"hw": "b200-trt", |
|
"conc": 32, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 4, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 897.4092021244418, |
|
"output_tput_per_gpu": 449.4154102966225, |
|
"input_tput_per_gpu": 447.9937918278193, |
|
"mean_ttft": 0.23239482318022056, |
|
"median_ttft": 0.09115337789990008, |
|
"std_ttft": 0.4398902793603599, |
|
"p99_ttft": 2.3007365879649297, |
|
"mean_tpot": 0.01717118986287973, |
|
"mean_intvty": 58.237082461115655, |
|
"median_tpot": 0.017297669240869083, |
|
"median_intvty": 57.81125688525174, |
|
"std_tpot": 0.0005474466058495037, |
|
"std_intvty": 1826.6621608663436, |
|
"p99_tpot": 0.01781354143178208, |
|
"p99_intvty": 56.13706874792719, |
|
"mean_itl": 0.17075729934448494, |
|
"median_itl": 0.15842054528184235, |
|
"std_itl": 0.03812959581703131, |
|
"p99_itl": 0.3030505743343383, |
|
"mean_e2el": 16.080806504502835, |
|
"median_e2el": 16.073315837420523, |
|
"std_e2el": 1.2719675075158121, |
|
"p99_e2el": 19.26560603603255 |
|
}, |
|
{ |
|
"hw": "b200", |
|
"conc": 4, |
|
"image": "lmsysorg/sglang:v0.5.5-cu129-amd64", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 111.19512760365721, |
|
"output_tput_per_gpu": 55.563619705252165, |
|
"input_tput_per_gpu": 55.63150789840505, |
|
"mean_ttft": 0.16721754746278747, |
|
"median_ttft": 0.1576263221213594, |
|
"std_ttft": 0.07806530207730589, |
|
"p99_ttft": 0.6859068887284957, |
|
"mean_tpot": 0.008764168852895795, |
|
"mean_intvty": 114.10095090415642, |
|
"median_tpot": 0.008774242714577298, |
|
"median_intvty": 113.96994960472499, |
|
"std_tpot": 0.00011516409095233161, |
|
"std_intvty": 8683.262219418006, |
|
"p99_tpot": 0.009054326213521549, |
|
"p99_intvty": 110.44444129996334, |
|
"mean_itl": 0.2586013330775246, |
|
"median_itl": 0.2543260745005682, |
|
"std_itl": 0.040273445530031114, |
|
"p99_itl": 0.35306476534111425, |
|
"mean_e2el": 8.227820904346881, |
|
"median_e2el": 8.210300613427535, |
|
"std_e2el": 0.5410468130695615, |
|
"p99_e2el": 9.210824889196083 |
|
}, |
|
{ |
|
"hw": "b200-trt", |
|
"conc": 4, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 4, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 229.93456671512357, |
|
"output_tput_per_gpu": 114.40785081455563, |
|
"input_tput_per_gpu": 115.52671590056794, |
|
"mean_ttft": 0.13291599152435082, |
|
"median_ttft": 0.10962579149054363, |
|
"std_ttft": 0.07607168375939898, |
|
"p99_ttft": 0.423498869598261, |
|
"mean_tpot": 0.008351382659914789, |
|
"mean_intvty": 119.74065142528187, |
|
"median_tpot": 0.008351485050254043, |
|
"median_intvty": 119.73918338865745, |
|
"std_tpot": 0.0001518442357067412, |
|
"std_intvty": 6585.696159920838, |
|
"p99_tpot": 0.008551696796468116, |
|
"p99_intvty": 116.93585773679486, |
|
"mean_itl": 0.08297909140369604, |
|
"median_itl": 0.08138331401278265, |
|
"std_itl": 0.014840220931810479, |
|
"p99_itl": 0.1608087609143695, |
|
"mean_e2el": 7.789811460523924, |
|
"median_e2el": 7.800381242501317, |
|
"std_e2el": 0.5647767090453408, |
|
"p99_e2el": 8.863045023245213 |
|
}, |
|
{ |
|
"hw": "b200", |
|
"conc": 128, |
|
"image": "lmsysorg/sglang:v0.5.5-cu129-amd64", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 4, |
|
"ep": 4, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 1765.9572259169158, |
|
"output_tput_per_gpu": 882.0097961255168, |
|
"input_tput_per_gpu": 883.947429791399, |
|
"mean_ttft": 7.9145328841425, |
|
"median_ttft": 0.6534272480057552, |
|
"std_ttft": 22.013923569348933, |
|
"p99_ttft": 75.18259847427485, |
|
"mean_tpot": 0.026936163368016776, |
|
"mean_intvty": 37.12481196143068, |
|
"median_tpot": 0.02710567983928478, |
|
"median_intvty": 36.892636743634846, |
|
"std_tpot": 0.0024170482056075464, |
|
"std_intvty": 413.7277848575805, |
|
"p99_tpot": 0.028612954982715915, |
|
"p99_intvty": 34.94920397435585, |
|
"mean_itl": 0.26823513125919995, |
|
"median_itl": 0.234339507878758, |
|
"std_itl": 0.23480635432712735, |
|
"p99_itl": 0.5736454221210443, |
|
"mean_e2el": 32.682275038535224, |
|
"median_e2el": 25.61190285696648, |
|
"std_e2el": 22.249872590635338, |
|
"p99_e2el": 102.19253178208368 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 2048, |
|
"image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", |
|
"model": "deepseek-ai/DeepSeek-R1", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 1, |
|
"prefill_ep": 1, |
|
"prefill_dp_attention": "true", |
|
"prefill_num_workers": 3, |
|
"decode_tp": 1, |
|
"decode_ep": 1, |
|
"decode_dp_attention": "true", |
|
"decode_num_workers": 1, |
|
"num_prefill_gpu": 24, |
|
"num_decode_gpu": 48, |
|
"tput_per_gpu": 2254.58332699924, |
|
"output_tput_per_gpu": 1690.9872901005592, |
|
"input_tput_per_gpu": 3381.7754007966014, |
|
"mean_ttft": 2.214189830340456, |
|
"median_ttft": 1.055184896918945, |
|
"std_ttft": 2.408734280176251, |
|
"p99_ttft": 8.548036242406816, |
|
"mean_tpot": 0.021043439521847215, |
|
"mean_intvty": 47.52074863815889, |
|
"median_tpot": 0.021092488528798803, |
|
"median_intvty": 47.410242685904116, |
|
"std_tpot": 0.00032801274110838866, |
|
"std_intvty": 3048.6620630067523, |
|
"p99_tpot": 0.021615610726902403, |
|
"p99_intvty": 46.26286125496412, |
|
"mean_itl": 1.0251579416142114, |
|
"median_itl": 1.037793020484969, |
|
"std_itl": 0.1440633617716875, |
|
"p99_itl": 1.228154654381797, |
|
"mean_e2el": 21.594880694794846, |
|
"median_e2el": 21.071244221995585, |
|
"std_e2el": 2.6360387943642634, |
|
"p99_e2el": 28.974507761406716 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 4096, |
|
"image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", |
|
"model": "deepseek-ai/DeepSeek-R1", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 1, |
|
"prefill_ep": 1, |
|
"prefill_dp_attention": "true", |
|
"prefill_num_workers": 3, |
|
"decode_tp": 1, |
|
"decode_ep": 1, |
|
"decode_dp_attention": "true", |
|
"decode_num_workers": 1, |
|
"num_prefill_gpu": 24, |
|
"num_decode_gpu": 48, |
|
"tput_per_gpu": 3819.240348384917, |
|
"output_tput_per_gpu": 2865.058363518239, |
|
"input_tput_per_gpu": 5727.604318118273, |
|
"mean_ttft": 3.4467532725996763, |
|
"median_ttft": 1.2692942654248327, |
|
"std_ttft": 4.679917411303755, |
|
"p99_ttft": 17.442662313824986, |
|
"mean_tpot": 0.023834804036256062, |
|
"mean_intvty": 41.955452978713836, |
|
"median_tpot": 0.024033075363934803, |
|
"median_intvty": 41.60932318718762, |
|
"std_tpot": 0.0007782127221459672, |
|
"std_intvty": 1284.9956978889286, |
|
"p99_tpot": 0.025030093990295838, |
|
"p99_intvty": 39.95190750732697, |
|
"mean_itl": 1.160987070265976, |
|
"median_itl": 1.1759424770716578, |
|
"std_itl": 0.19865541049288465, |
|
"p99_itl": 1.7058381702192131, |
|
"mean_e2el": 25.40658549897133, |
|
"median_e2el": 23.887357637868263, |
|
"std_e2el": 4.934672434215348, |
|
"p99_e2el": 40.410193555047734 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 1024, |
|
"image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", |
|
"model": "deepseek-ai/DeepSeek-R1", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 1, |
|
"prefill_ep": 1, |
|
"prefill_dp_attention": "true", |
|
"prefill_num_workers": 3, |
|
"decode_tp": 1, |
|
"decode_ep": 1, |
|
"decode_dp_attention": "true", |
|
"decode_num_workers": 1, |
|
"num_prefill_gpu": 24, |
|
"num_decode_gpu": 48, |
|
"tput_per_gpu": 596.52577507071, |
|
"output_tput_per_gpu": 447.60049140073176, |
|
"input_tput_per_gpu": 894.3763424106663, |
|
"mean_ttft": 22.808323376209593, |
|
"median_ttft": 0.6381742671364918, |
|
"std_ttft": 37.989100830794655, |
|
"p99_ttft": 144.71110697851518, |
|
"mean_tpot": 0.019870145474881782, |
|
"mean_intvty": 50.32675786214643, |
|
"median_tpot": 0.019904019936956542, |
|
"median_intvty": 50.24110723197491, |
|
"std_tpot": 0.00030198810316582446, |
|
"std_intvty": 3311.388725306476, |
|
"p99_tpot": 0.02050075042601824, |
|
"p99_intvty": 48.778702204523405, |
|
"mean_itl": 0.9672413699327611, |
|
"median_itl": 0.9874429774936289, |
|
"std_itl": 0.13335430569667675, |
|
"p99_itl": 1.1014580716658382, |
|
"mean_e2el": 41.115633187459935, |
|
"median_e2el": 19.922846795874648, |
|
"std_e2el": 37.93415756708192, |
|
"p99_e2el": 163.28720403442392 |
|
}, |
|
{ |
|
"hw": "h200", |
|
"conc": 32, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 8, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 295.81460558317224, |
|
"output_tput_per_gpu": 148.14160811497888, |
|
"input_tput_per_gpu": 147.67299746819336, |
|
"mean_ttft": 0.47806689820135945, |
|
"median_ttft": 0.12522324442397803, |
|
"std_ttft": 1.0877645907867788, |
|
"p99_ttft": 4.820350355247501, |
|
"mean_tpot": 0.025896730488496424, |
|
"mean_intvty": 38.614913200884935, |
|
"median_tpot": 0.025927905091378087, |
|
"median_intvty": 38.5684842826941, |
|
"std_tpot": 0.0008744587659171604, |
|
"std_intvty": 1143.5644983799414, |
|
"p99_tpot": 0.027313797021867186, |
|
"p99_intvty": 36.61153369483594, |
|
"mean_itl": 0.25752916184175595, |
|
"median_itl": 0.24227966740727425, |
|
"std_itl": 0.05412672899346776, |
|
"p99_itl": 0.39470253719249737, |
|
"mean_e2el": 24.37999208016263, |
|
"median_e2el": 24.36240943497978, |
|
"std_e2el": 2.1026897143543195, |
|
"p99_e2el": 30.27027516528964 |
|
}, |
|
{ |
|
"hw": "b200-trt", |
|
"conc": 8, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 8, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 123.9594290912726, |
|
"output_tput_per_gpu": 62.20946588953585, |
|
"input_tput_per_gpu": 61.74996320173675, |
|
"mean_ttft": 0.20791010695975273, |
|
"median_ttft": 0.13392320391722023, |
|
"std_ttft": 0.23873498713961971, |
|
"p99_ttft": 1.2763758396357294, |
|
"mean_tpot": 0.015510140992766677, |
|
"mean_intvty": 64.47394646292132, |
|
"median_tpot": 0.015545382547207966, |
|
"median_intvty": 64.32778331206815, |
|
"std_tpot": 0.0002790458830966927, |
|
"std_intvty": 3583.6400412095963, |
|
"p99_tpot": 0.016258878910921846, |
|
"p99_intvty": 61.50485562250257, |
|
"mean_itl": 0.15420951023277643, |
|
"median_itl": 0.149389476981014, |
|
"std_itl": 0.030909557339096977, |
|
"p99_itl": 0.2470420343056321, |
|
"mean_e2el": 14.60529533646186, |
|
"median_e2el": 14.643656376749277, |
|
"std_e2el": 1.0323764703938179, |
|
"p99_e2el": 16.53920134677086 |
|
}, |
|
{ |
|
"hw": "mi325x", |
|
"conc": 4, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 44.127972793849686, |
|
"output_tput_per_gpu": 21.95662270476474, |
|
"input_tput_per_gpu": 22.171350089084946, |
|
"mean_ttft": 0.18732226876018104, |
|
"median_ttft": 0.13417264650342986, |
|
"std_ttft": 0.1871872933414264, |
|
"p99_ttft": 0.8453631609748118, |
|
"mean_tpot": 0.02184903528048016, |
|
"mean_intvty": 45.768611161216626, |
|
"median_tpot": 0.021843753863896573, |
|
"median_intvty": 45.77967716678969, |
|
"std_tpot": 0.00016489244187872643, |
|
"std_intvty": 6064.559349151193, |
|
"p99_tpot": 0.02233645603919966, |
|
"p99_intvty": 44.76985956254819, |
|
"mean_itl": 0.02184876543590368, |
|
"median_itl": 0.021517455927096307, |
|
"std_itl": 0.005299868380454074, |
|
"p99_itl": 0.024133532755076886, |
|
"mean_e2el": 20.2166304375045, |
|
"median_e2el": 20.32324968004832, |
|
"std_e2el": 1.4052141974331072, |
|
"p99_e2el": 22.871388312890193 |
|
}, |
|
{ |
|
"hw": "mi355x", |
|
"conc": 8, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", |
|
"model": "amd/DeepSeek-R1-0528-MXFP4-Preview", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 140.00698736060681, |
|
"output_tput_per_gpu": 70.19094865775841, |
|
"input_tput_per_gpu": 69.8160387028484, |
|
"mean_ttft": 0.18259024413724545, |
|
"median_ttft": 0.1001861539989477, |
|
"std_ttft": 0.17596723566911493, |
|
"p99_ttft": 1.0825899649425992, |
|
"mean_tpot": 0.013947495535800778, |
|
"mean_intvty": 71.69745976495746, |
|
"median_tpot": 0.013855465332991, |
|
"median_intvty": 72.17368568769162, |
|
"std_tpot": 0.0003825080453294319, |
|
"std_intvty": 2614.324096474254, |
|
"p99_tpot": 0.015049243699052927, |
|
"p99_intvty": 66.44852193223049, |
|
"mean_itl": 0.0139489930462004, |
|
"median_itl": 0.01299674999609124, |
|
"std_itl": 0.01347560963089915, |
|
"p99_itl": 0.017627689233922498, |
|
"mean_e2el": 13.089244616760334, |
|
"median_e2el": 13.033233756999834, |
|
"std_e2el": 0.9139494301676425, |
|
"p99_e2el": 15.35890747389174 |
|
}, |
|
{ |
|
"hw": "b200-trt", |
|
"conc": 8, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 220.9558926044314, |
|
"output_tput_per_gpu": 110.88747475552128, |
|
"input_tput_per_gpu": 110.06841784891012, |
|
"mean_ttft": 0.12296479816432111, |
|
"median_ttft": 0.08433223608881235, |
|
"std_ttft": 0.12065941863024443, |
|
"p99_ttft": 0.650510452305898, |
|
"mean_tpot": 0.008710679725442714, |
|
"mean_intvty": 114.8016034935983, |
|
"median_tpot": 0.008731782431272827, |
|
"median_intvty": 114.52415447486483, |
|
"std_tpot": 0.0001848908743781026, |
|
"std_intvty": 5408.595764196538, |
|
"p99_tpot": 0.008956694505365716, |
|
"p99_intvty": 111.64833180376162, |
|
"mean_itl": 0.0866201442164717, |
|
"median_itl": 0.08309970516711473, |
|
"std_itl": 0.01830391695428319, |
|
"p99_itl": 0.147244536653161, |
|
"mean_e2el": 8.210037848516368, |
|
"median_e2el": 8.292171580484137, |
|
"std_e2el": 0.5918520767172697, |
|
"p99_e2el": 9.20223142792005 |
|
}, |
|
{ |
|
"hw": "h200", |
|
"conc": 64, |
|
"image": "lmsysorg/sglang:v0.5.5-cu129-amd64", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 492.19327296594696, |
|
"output_tput_per_gpu": 246.04116929867362, |
|
"input_tput_per_gpu": 246.15210366727334, |
|
"mean_ttft": 0.3294624885340454, |
|
"median_ttft": 0.14435125421732664, |
|
"std_ttft": 0.5078944623912816, |
|
"p99_ttft": 2.109193651378155, |
|
"mean_tpot": 0.03143482613417446, |
|
"mean_intvty": 31.811850834856287, |
|
"median_tpot": 0.03206727878971416, |
|
"median_intvty": 31.18443590295408, |
|
"std_tpot": 0.0018697724283459685, |
|
"std_intvty": 534.8244443226797, |
|
"p99_tpot": 0.033419055362707145, |
|
"p99_intvty": 29.923048067837243, |
|
"mean_itl": 0.3130929929381459, |
|
"median_itl": 0.2652675621211529, |
|
"std_itl": 0.09325036784693132, |
|
"p99_itl": 0.6344047609344124, |
|
"mean_e2el": 29.297902333908134, |
|
"median_e2el": 29.55287105590105, |
|
"std_e2el": 2.662894988399935, |
|
"p99_e2el": 33.959484479147946 |
|
}, |
|
{ |
|
"hw": "b200", |
|
"conc": 16, |
|
"image": "lmsysorg/sglang:v0.5.5-cu129-amd64", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 282.93776002462636, |
|
"output_tput_per_gpu": 141.22602757361327, |
|
"input_tput_per_gpu": 141.7117324510131, |
|
"mean_ttft": 0.3285720966418739, |
|
"median_ttft": 0.30267622182145715, |
|
"std_ttft": 0.18123222154615, |
|
"p99_ttft": 1.3213041639281438, |
|
"mean_tpot": 0.01369717992996946, |
|
"mean_intvty": 73.00772897142117, |
|
"median_tpot": 0.013752427774858886, |
|
"median_intvty": 72.71443387095054, |
|
"std_tpot": 0.0003402537420072122, |
|
"std_intvty": 2938.9831074328154, |
|
"p99_tpot": 0.014221395040691245, |
|
"p99_intvty": 70.31658969733492, |
|
"mean_itl": 0.4046281688127338, |
|
"median_itl": 0.38193408865481615, |
|
"std_itl": 0.0653815429299919, |
|
"p99_itl": 0.5050839721411464, |
|
"mean_e2el": 12.926164196388562, |
|
"median_e2el": 12.94644465087913, |
|
"std_e2el": 0.8653985453691946, |
|
"p99_e2el": 14.583644766402431 |
|
}, |
|
{ |
|
"hw": "b200", |
|
"conc": 128, |
|
"image": "lmsysorg/sglang:v0.5.5-cu129-amd64", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 8, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 1031.8263749958364, |
|
"output_tput_per_gpu": 515.3471201288461, |
|
"input_tput_per_gpu": 516.4792548669903, |
|
"mean_ttft": 8.047447742966323, |
|
"median_ttft": 0.5291580425109714, |
|
"std_ttft": 22.739150392137557, |
|
"p99_ttft": 77.40256422179519, |
|
"mean_tpot": 0.02167746646369556, |
|
"mean_intvty": 46.13085213047174, |
|
"median_tpot": 0.02176623053086376, |
|
"median_intvty": 45.94272759272832, |
|
"std_tpot": 0.0026811437126168193, |
|
"std_intvty": 372.975158061927, |
|
"p99_tpot": 0.023366678003612758, |
|
"p99_intvty": 42.79598494254889, |
|
"mean_itl": 0.21577838987195336, |
|
"median_itl": 0.18363446393050253, |
|
"std_itl": 0.23810750019698657, |
|
"p99_itl": 0.537630925734993, |
|
"mean_e2el": 27.971547497398934, |
|
"median_e2el": 20.622131744516082, |
|
"std_e2el": 22.92224569320669, |
|
"p99_e2el": 99.02907122727484 |
|
}, |
|
{ |
|
"hw": "mi355x", |
|
"conc": 16, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 175.95277232368952, |
|
"output_tput_per_gpu": 87.82536156954185, |
|
"input_tput_per_gpu": 88.12741075414766, |
|
"mean_ttft": 0.11519924342464946, |
|
"median_ttft": 0.10491188800369855, |
|
"std_ttft": 0.06769445317163822, |
|
"p99_ttft": 0.6158818911685375, |
|
"mean_tpot": 0.02244746672902508, |
|
"mean_intvty": 44.54845671770069, |
|
"median_tpot": 0.02245250594216652, |
|
"median_intvty": 44.53845831622599, |
|
"std_tpot": 0.000301557232503499, |
|
"std_intvty": 3316.1200999826683, |
|
"p99_tpot": 0.023019226943117633, |
|
"p99_intvty": 43.44194539942982, |
|
"mean_itl": 0.02244832270104978, |
|
"median_itl": 0.021457998504047282, |
|
"std_itl": 0.0063888792957696615, |
|
"p99_itl": 0.06889111073454844, |
|
"mean_e2el": 20.75946228062159, |
|
"median_e2el": 20.800773798502632, |
|
"std_e2el": 1.3378757943128958, |
|
"p99_e2el": 23.093859420334628 |
|
}, |
|
{ |
|
"hw": "b200-trt", |
|
"conc": 16, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 4, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 542.1618666266606, |
|
"output_tput_per_gpu": 269.6456982992158, |
|
"input_tput_per_gpu": 272.51616832744486, |
|
"mean_ttft": 0.19474708190791717, |
|
"median_ttft": 0.11112058749131393, |
|
"std_ttft": 0.26386422530551684, |
|
"p99_ttft": 1.4121461816126244, |
|
"mean_tpot": 0.014355811529131202, |
|
"mean_intvty": 69.6582006507102, |
|
"median_tpot": 0.014387425535050439, |
|
"median_intvty": 69.50513818916487, |
|
"std_tpot": 0.0005065612734389114, |
|
"std_intvty": 1974.094847818237, |
|
"p99_tpot": 0.0150497070429755, |
|
"p99_intvty": 66.44647614365047, |
|
"mean_itl": 0.14276671962343918, |
|
"median_itl": 0.13329885000712238, |
|
"std_itl": 0.03508060589322459, |
|
"p99_itl": 0.2761659569863696, |
|
"mean_e2el": 13.333746583119318, |
|
"median_e2el": 13.328474082998582, |
|
"std_e2el": 1.0801044653636331, |
|
"p99_e2el": 15.695636622975579 |
|
}, |
|
{ |
|
"hw": "mi355x", |
|
"conc": 64, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", |
|
"model": "amd/DeepSeek-R1-0528-MXFP4-Preview", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 620.9878513301078, |
|
"output_tput_per_gpu": 310.5995673004801, |
|
"input_tput_per_gpu": 310.3882840296277, |
|
"mean_ttft": 0.19784741542786832, |
|
"median_ttft": 0.1088224050035933, |
|
"std_ttft": 0.3966024749883867, |
|
"p99_ttft": 2.681221848351415, |
|
"mean_tpot": 0.025376081027359058, |
|
"mean_intvty": 39.407188167544724, |
|
"median_tpot": 0.024855360171440733, |
|
"median_intvty": 40.23277044076064, |
|
"std_tpot": 0.0017309797317995841, |
|
"std_intvty": 577.7075153620468, |
|
"p99_tpot": 0.03262463249501408, |
|
"p99_intvty": 30.651686272721292, |
|
"mean_itl": 0.025385707519216687, |
|
"median_itl": 0.019969038985436782, |
|
"std_itl": 0.026318395158356062, |
|
"p99_itl": 0.09190794019959866, |
|
"mean_e2el": 23.575164543636063, |
|
"median_e2el": 23.38487718749093, |
|
"std_e2el": 2.420801068537152, |
|
"p99_e2el": 32.063513461248625 |
|
}, |
|
{ |
|
"hw": "b200", |
|
"conc": 64, |
|
"image": "lmsysorg/sglang:v0.5.5-cu129-amd64", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 8, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 634.0067710756045, |
|
"output_tput_per_gpu": 316.9319368359403, |
|
"input_tput_per_gpu": 317.07483423966426, |
|
"mean_ttft": 7.22376137591832, |
|
"median_ttft": 0.37949243550247047, |
|
"std_ttft": 20.740413428655884, |
|
"p99_ttft": 70.49232543275575, |
|
"mean_tpot": 0.016875033929500426, |
|
"mean_intvty": 59.259140110635876, |
|
"median_tpot": 0.01690659974814581, |
|
"median_intvty": 59.14849910075339, |
|
"std_tpot": 0.0031217040798108392, |
|
"std_intvty": 320.33785856492693, |
|
"p99_tpot": 0.018047388284074196, |
|
"p99_intvty": 55.4096794649475, |
|
"mean_itl": 0.16803487778019416, |
|
"median_itl": 0.1487200039846357, |
|
"std_itl": 0.29149919278814873, |
|
"p99_itl": 0.47566765745752493, |
|
"mean_e2el": 22.770925787576562, |
|
"median_e2el": 15.963349754514638, |
|
"std_e2el": 21.0956728233861, |
|
"p99_e2el": 87.7593521033763 |
|
}, |
|
{ |
|
"hw": "b200", |
|
"conc": 8, |
|
"image": "lmsysorg/sglang:v0.5.5-cu129-amd64", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 182.06220418264795, |
|
"output_tput_per_gpu": 91.27486468506198, |
|
"input_tput_per_gpu": 90.78733949758596, |
|
"mean_ttft": 0.16588181990533485, |
|
"median_ttft": 0.14928729851089884, |
|
"std_ttft": 0.09467247359957626, |
|
"p99_ttft": 0.793000138529169, |
|
"mean_tpot": 0.010705423357162165, |
|
"mean_intvty": 93.41059822085205, |
|
"median_tpot": 0.010717351393897706, |
|
"median_intvty": 93.30663549665681, |
|
"std_tpot": 0.00014529555709801375, |
|
"std_intvty": 6882.522906914614, |
|
"p99_tpot": 0.010999663943066394, |
|
"p99_intvty": 90.91186832397248, |
|
"mean_itl": 0.316213313779033, |
|
"median_itl": 0.30522731100791134, |
|
"std_itl": 0.04941966283408154, |
|
"p99_itl": 0.4606577536062105, |
|
"mean_e2el": 10.072054259582728, |
|
"median_e2el": 10.05200670848717, |
|
"std_e2el": 0.6347270474702321, |
|
"p99_e2el": 11.211421411118353 |
|
}, |
|
{ |
|
"hw": "mi300x", |
|
"conc": 64, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 371.29733532210605, |
|
"output_tput_per_gpu": 185.60682471264326, |
|
"input_tput_per_gpu": 185.6905106094628, |
|
"mean_ttft": 0.5166783945925999, |
|
"median_ttft": 0.1663797835353762, |
|
"std_ttft": 1.0318921658323659, |
|
"p99_ttft": 4.326555775185116, |
|
"mean_tpot": 0.04136769045838726, |
|
"mean_intvty": 24.173454909355495, |
|
"median_tpot": 0.04220044935812309, |
|
"median_intvty": 23.69643013783482, |
|
"std_tpot": 0.0024437828806327822, |
|
"std_intvty": 409.20165532097695, |
|
"p99_tpot": 0.04392209482570136, |
|
"p99_intvty": 22.76758437793914, |
|
"mean_itl": 0.04139217279887544, |
|
"median_itl": 0.03621176490560174, |
|
"std_itl": 0.026748115937057938, |
|
"p99_itl": 0.13387840744107965, |
|
"mean_e2el": 38.63117301587918, |
|
"median_e2el": 38.84609429119155, |
|
"std_e2el": 3.262300791385069, |
|
"p99_e2el": 44.6767261645291 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 512, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "mtp", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "true", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 32, |
|
"decode_ep": 32, |
|
"decode_dp_attention": "true", |
|
"decode_num_workers": 1, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 32, |
|
"tput_per_gpu": 2412.1309327995714, |
|
"output_tput_per_gpu": 1357.0583904820062, |
|
"input_tput_per_gpu": 10852.711271340095, |
|
"mean_ttft": 1.2348787148280849, |
|
"median_ttft": 0.4148194439476356, |
|
"std_ttft": 2.200366709393639, |
|
"p99_ttft": 9.825874333607498, |
|
"mean_tpot": 0.009670138642223663, |
|
"mean_intvty": 103.41113369704993, |
|
"median_tpot": 0.009650568658963625, |
|
"median_intvty": 103.62083679609717, |
|
"std_tpot": 0.0011122349812344497, |
|
"std_intvty": 899.0905850579505, |
|
"p99_tpot": 0.012485798162104612, |
|
"p99_intvty": 80.09099514639595, |
|
"mean_itl": 0.4434240914943136, |
|
"median_itl": 0.44113103090785444, |
|
"std_itl": 0.12849161349339594, |
|
"p99_itl": 0.6888551347143952, |
|
"mean_e2el": 10.132914845539517, |
|
"median_e2el": 9.352651417022571, |
|
"std_e2el": 2.837751493419591, |
|
"p99_e2el": 20.733681057707873 |
|
}, |
|
{ |
|
"hw": "mi325x", |
|
"conc": 64, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 411.88752912578764, |
|
"output_tput_per_gpu": 205.8973473468454, |
|
"input_tput_per_gpu": 205.99018177894223, |
|
"mean_ttft": 0.4331379817704146, |
|
"median_ttft": 0.14366331044584513, |
|
"std_ttft": 0.8443757891982799, |
|
"p99_ttft": 3.6977680043829606, |
|
"mean_tpot": 0.037211900335084536, |
|
"mean_intvty": 26.87312367805008, |
|
"median_tpot": 0.03779024314733602, |
|
"median_intvty": 26.46185673114659, |
|
"std_tpot": 0.001843556770021934, |
|
"std_intvty": 542.4297294561221, |
|
"p99_tpot": 0.03946798171097013, |
|
"p99_intvty": 25.336993599600508, |
|
"mean_itl": 0.037236205271839656, |
|
"median_itl": 0.032972196117043495, |
|
"std_itl": 0.019487043197584968, |
|
"p99_itl": 0.11140763387084021, |
|
"mean_e2el": 34.72075919606213, |
|
"median_e2el": 34.903472155099735, |
|
"std_e2el": 2.8688129703284284, |
|
"p99_e2el": 40.089818246290086 |
|
}, |
|
{ |
|
"hw": "b200-trt", |
|
"conc": 16, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 8, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 316.15706593132177, |
|
"output_tput_per_gpu": 157.2415879149002, |
|
"input_tput_per_gpu": 158.91547801642156, |
|
"mean_ttft": 0.19592693944960046, |
|
"median_ttft": 0.11224943700653967, |
|
"std_ttft": 0.27968003475038467, |
|
"p99_ttft": 1.481824649146001, |
|
"mean_tpot": 0.012260266909942353, |
|
"mean_intvty": 81.5642927960287, |
|
"median_tpot": 0.012354373190594074, |
|
"median_intvty": 80.94299763919581, |
|
"std_tpot": 0.00044672944415771537, |
|
"std_intvty": 2238.4913577511034, |
|
"p99_tpot": 0.012946660824161195, |
|
"p99_intvty": 77.23999366182433, |
|
"mean_itl": 0.12192719178198376, |
|
"median_itl": 0.11135585900046863, |
|
"std_itl": 0.036943895333217014, |
|
"p99_itl": 0.2694482011673972, |
|
"mean_e2el": 11.417038633462472, |
|
"median_e2el": 11.404057557490887, |
|
"std_e2el": 0.9445595875237522, |
|
"p99_e2el": 13.646970738812815 |
|
}, |
|
{ |
|
"hw": "b200-trt", |
|
"conc": 4, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 8, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 65.83950111446778, |
|
"output_tput_per_gpu": 32.75956255216392, |
|
"input_tput_per_gpu": 33.07993856230386, |
|
"mean_ttft": 0.2134733396473166, |
|
"median_ttft": 0.16361602349206805, |
|
"std_ttft": 0.15198227124610095, |
|
"p99_ttft": 0.786172540151747, |
|
"mean_tpot": 0.014585688703470318, |
|
"mean_intvty": 68.56035531336096, |
|
"median_tpot": 0.014609090837043578, |
|
"median_intvty": 68.45052927348138, |
|
"std_tpot": 0.00021310625289578373, |
|
"std_intvty": 4692.494877140158, |
|
"p99_tpot": 0.014859207445867465, |
|
"p99_intvty": 67.29834034843579, |
|
"mean_itl": 0.1449061915521784, |
|
"median_itl": 0.1421968379872851, |
|
"std_itl": 0.024724391368815397, |
|
"p99_itl": 0.2652863415918546, |
|
"mean_e2el": 13.584691922573256, |
|
"median_e2el": 13.677898031499353, |
|
"std_e2el": 0.9525622134625409, |
|
"p99_e2el": 15.508266156367027 |
|
}, |
|
{ |
|
"hw": "h200", |
|
"conc": 4, |
|
"image": "lmsysorg/sglang:v0.5.5-cu129-amd64", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 82.20657950322091, |
|
"output_tput_per_gpu": 40.90328052080915, |
|
"input_tput_per_gpu": 41.30329898241176, |
|
"mean_ttft": 0.19577070961240678, |
|
"median_ttft": 0.1558728776872158, |
|
"std_ttft": 0.11544510834579869, |
|
"p99_ttft": 0.5914431960135699, |
|
"mean_tpot": 0.011667913389199234, |
|
"mean_intvty": 85.70512709887623, |
|
"median_tpot": 0.011701949067385746, |
|
"median_intvty": 85.45584964021752, |
|
"std_tpot": 0.00023957319327288688, |
|
"std_intvty": 4174.08970652633, |
|
"p99_tpot": 0.011933998961290833, |
|
"p99_intvty": 83.79420873452429, |
|
"mean_itl": 0.11614621101057483, |
|
"median_itl": 0.11302625387907028, |
|
"std_itl": 0.02490280305721305, |
|
"p99_itl": 0.24921410814858974, |
|
"mean_e2el": 10.892836503894069, |
|
"median_e2el": 10.968858200125396, |
|
"std_e2el": 0.7894751679459576, |
|
"p99_e2el": 12.335861399509012 |
|
}, |
|
{ |
|
"hw": "mi300x", |
|
"conc": 4, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 44.84628442114787, |
|
"output_tput_per_gpu": 22.314030860782047, |
|
"input_tput_per_gpu": 22.532253560365824, |
|
"mean_ttft": 0.2565912339836359, |
|
"median_ttft": 0.1764916297979653, |
|
"std_ttft": 0.28480466485570477, |
|
"p99_ttft": 1.2572721256315709, |
|
"mean_tpot": 0.02142861385915102, |
|
"mean_intvty": 46.6665742624763, |
|
"median_tpot": 0.021444322848459846, |
|
"median_intvty": 46.632388770989856, |
|
"std_tpot": 0.00022878034763899524, |
|
"std_intvty": 4371.004810159453, |
|
"p99_tpot": 0.02210502082906931, |
|
"p99_intvty": 45.23859116590134, |
|
"mean_itl": 0.021428176971848484, |
|
"median_itl": 0.02100955881178379, |
|
"std_itl": 0.008492415864153207, |
|
"p99_itl": 0.021389827877283097, |
|
"mean_e2el": 19.900336326356047, |
|
"median_e2el": 20.007334575988352, |
|
"std_e2el": 1.4051264818491935, |
|
"p99_e2el": 22.784044573558493 |
|
}, |
|
{ |
|
"hw": "mi355x", |
|
"conc": 32, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 296.6379693181095, |
|
"output_tput_per_gpu": 148.18366482925754, |
|
"input_tput_per_gpu": 148.454304488852, |
|
"mean_ttft": 0.13541241095852455, |
|
"median_ttft": 0.11120598450361285, |
|
"std_ttft": 0.14229346180085098, |
|
"p99_ttft": 1.139467287303123, |
|
"mean_tpot": 0.026669320744572032, |
|
"mean_intvty": 37.49626807437638, |
|
"median_tpot": 0.026734878875288692, |
|
"median_intvty": 37.404321323643984, |
|
"std_tpot": 0.0003854001607907831, |
|
"std_intvty": 2594.705715607774, |
|
"p99_tpot": 0.027109898874429227, |
|
"p99_intvty": 36.886895249293104, |
|
"mean_itl": 0.02667614534533136, |
|
"median_itl": 0.024693420011317357, |
|
"std_itl": 0.009555510403225349, |
|
"p99_itl": 0.07460135300061665, |
|
"mean_e2el": 24.661993785221583, |
|
"median_e2el": 24.6352058104967, |
|
"std_e2el": 1.664611496660449, |
|
"p99_e2el": 27.556570331401307 |
|
}, |
|
{ |
|
"hw": "mi355x", |
|
"conc": 32, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", |
|
"model": "amd/DeepSeek-R1-0528-MXFP4-Preview", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 403.98555109201277, |
|
"output_tput_per_gpu": 201.80848606971315, |
|
"input_tput_per_gpu": 202.17706502229962, |
|
"mean_ttft": 0.16468010068520017, |
|
"median_ttft": 0.10207451799942646, |
|
"std_ttft": 0.22698254128855616, |
|
"p99_ttft": 1.6246460357803152, |
|
"mean_tpot": 0.01951736207539238, |
|
"mean_intvty": 51.23643226667433, |
|
"median_tpot": 0.019135252145820007, |
|
"median_intvty": 52.25956744021503, |
|
"std_tpot": 0.0010971724550971225, |
|
"std_intvty": 911.4337453098741, |
|
"p99_tpot": 0.023190558551986772, |
|
"p99_intvty": 43.120996752116966, |
|
"mean_itl": 0.019526418013656087, |
|
"median_itl": 0.01661322099971585, |
|
"std_itl": 0.02090876477857112, |
|
"p99_itl": 0.08577526139852125, |
|
"mean_e2el": 18.119941349035706, |
|
"median_e2el": 17.978715368997655, |
|
"std_e2el": 1.704819452919009, |
|
"p99_e2el": 23.524846079876298 |
|
}, |
|
{ |
|
"hw": "mi355x", |
|
"conc": 4, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", |
|
"model": "amd/DeepSeek-R1-0528-MXFP4-Preview", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 75.55383157238532, |
|
"output_tput_per_gpu": 37.753851767014105, |
|
"input_tput_per_gpu": 37.799979805371215, |
|
"mean_ttft": 0.1950170337059535, |
|
"median_ttft": 0.18358874150726479, |
|
"std_ttft": 0.17356631750535162, |
|
"p99_ttft": 0.8757589224123464, |
|
"mean_tpot": 0.012947859907705438, |
|
"mean_intvty": 77.23284057196875, |
|
"median_tpot": 0.012902648840497953, |
|
"median_intvty": 77.50346555672105, |
|
"std_tpot": 0.0002587035934702906, |
|
"std_intvty": 3865.4275597251785, |
|
"p99_tpot": 0.013931271001429813, |
|
"p99_intvty": 71.78095953322325, |
|
"mean_itl": 0.012948304819368695, |
|
"median_itl": 0.012470546003896743, |
|
"std_itl": 0.010048984615135668, |
|
"p99_itl": 0.012806082313181836, |
|
"mean_e2el": 12.104414425215946, |
|
"median_e2el": 12.07736179100175, |
|
"std_e2el": 0.863903277527204, |
|
"p99_e2el": 13.789736369127054 |
|
}, |
|
{ |
|
"hw": "b200-trt", |
|
"conc": 64, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 8, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 853.7762593914858, |
|
"output_tput_per_gpu": 426.79191431099144, |
|
"input_tput_per_gpu": 426.9843450804944, |
|
"mean_ttft": 0.3851088558010815, |
|
"median_ttft": 0.09621681412681937, |
|
"std_ttft": 0.8822966022008791, |
|
"p99_ttft": 4.566773009821774, |
|
"mean_tpot": 0.01784122875622258, |
|
"mean_intvty": 56.049951136421846, |
|
"median_tpot": 0.01801663246082011, |
|
"median_intvty": 55.504268190776, |
|
"std_tpot": 0.0009890226694180047, |
|
"std_intvty": 1011.0991698384981, |
|
"p99_tpot": 0.01990115349596598, |
|
"p99_intvty": 50.24834365519078, |
|
"mean_itl": 0.17756355077926556, |
|
"median_itl": 0.1462849578820169, |
|
"std_itl": 0.06702514515133351, |
|
"p99_itl": 0.423486899472773, |
|
"mean_e2el": 16.831100255452476, |
|
"median_e2el": 16.804378338623792, |
|
"std_e2el": 1.8351830520332975, |
|
"p99_e2el": 22.284333304809408 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 512, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "mtp", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "true", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 16, |
|
"decode_ep": 16, |
|
"decode_dp_attention": "true", |
|
"decode_num_workers": 1, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 16, |
|
"tput_per_gpu": 3803.0838216335337, |
|
"output_tput_per_gpu": 2377.338614987014, |
|
"input_tput_per_gpu": 9506.064648219613, |
|
"mean_ttft": 1.0884636400741101, |
|
"median_ttft": 0.32718132459558547, |
|
"std_ttft": 2.091629423193156, |
|
"p99_ttft": 9.353588376671542, |
|
"mean_tpot": 0.011509891609823639, |
|
"mean_intvty": 86.88179123654864, |
|
"median_tpot": 0.011632329475121321, |
|
"median_intvty": 85.96730363756916, |
|
"std_tpot": 0.0010237054679599301, |
|
"std_intvty": 976.8434684566344, |
|
"p99_tpot": 0.013400882430482888, |
|
"p99_intvty": 74.62195159068834, |
|
"mean_itl": 0.5264395102407845, |
|
"median_itl": 0.540485356003046, |
|
"std_itl": 0.07010396636026237, |
|
"p99_itl": 0.5593632255285047, |
|
"mean_e2el": 11.679974086736138, |
|
"median_e2el": 11.172180174500681, |
|
"std_e2el": 2.3721075879442175, |
|
"p99_e2el": 20.32006769160507 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 1075, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "mtp", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "true", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 16, |
|
"decode_ep": 16, |
|
"decode_dp_attention": "true", |
|
"decode_num_workers": 1, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 16, |
|
"tput_per_gpu": 5604.6687381548945, |
|
"output_tput_per_gpu": 3503.4906576192484, |
|
"input_tput_per_gpu": 14009.381060297479, |
|
"mean_ttft": 2.929410582676714, |
|
"median_ttft": 1.9941336774500087, |
|
"std_ttft": 2.9508642050011513, |
|
"p99_ttft": 15.882445676790084, |
|
"mean_tpot": 0.014985299876658552, |
|
"mean_intvty": 66.7320646387346, |
|
"median_tpot": 0.015225681108687592, |
|
"median_intvty": 65.67850678479086, |
|
"std_tpot": 0.0015733188805919138, |
|
"std_intvty": 635.5990589929106, |
|
"p99_tpot": 0.017748282728609687, |
|
"p99_intvty": 56.34347927013979, |
|
"mean_itl": 0.6921651997490522, |
|
"median_itl": 0.709355709142983, |
|
"std_itl": 0.10932932913412986, |
|
"p99_itl": 0.9747483725799247, |
|
"mean_e2el": 16.722411785457382, |
|
"median_e2el": 16.128651749459095, |
|
"std_e2el": 3.356755748706387, |
|
"p99_e2el": 30.195009535551073 |
|
}, |
|
{ |
|
"hw": "mi355x", |
|
"conc": 16, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", |
|
"model": "amd/DeepSeek-R1-0528-MXFP4-Preview", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 244.90560535733977, |
|
"output_tput_per_gpu": 122.24259417377776, |
|
"input_tput_per_gpu": 122.663011183562, |
|
"mean_ttft": 0.16389791450812483, |
|
"median_ttft": 0.09834074550599325, |
|
"std_ttft": 0.1903269023558845, |
|
"p99_ttft": 1.3221142903674628, |
|
"mean_tpot": 0.016047165010289827, |
|
"mean_intvty": 62.316303182448486, |
|
"median_tpot": 0.01585998137675952, |
|
"median_intvty": 63.051776433064006, |
|
"std_tpot": 0.0006524999791487008, |
|
"std_intvty": 1532.5670987831652, |
|
"p99_tpot": 0.01831432546441761, |
|
"p99_intvty": 54.602065576636825, |
|
"mean_itl": 0.01604871906259868, |
|
"median_itl": 0.014374136982951313, |
|
"std_itl": 0.016872885345090353, |
|
"p99_itl": 0.08327452082856326, |
|
"mean_e2el": 14.922861486079965, |
|
"median_e2el": 14.877197617010097, |
|
"std_e2el": 1.1742886982132807, |
|
"p99_e2el": 18.128650223699513 |
|
}, |
|
{ |
|
"hw": "mi355x", |
|
"conc": 4, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", |
|
"model": "amd/DeepSeek-R1-0528-MXFP4-Preview", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 4, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 127.09445272502897, |
|
"output_tput_per_gpu": 63.50842874187588, |
|
"input_tput_per_gpu": 63.58602398315309, |
|
"mean_ttft": 0.22369928099025857, |
|
"median_ttft": 0.20476960801170208, |
|
"std_ttft": 0.25455547776659093, |
|
"p99_ttft": 1.3235857760004002, |
|
"mean_tpot": 0.015406933129963014, |
|
"mean_intvty": 64.90584411346768, |
|
"median_tpot": 0.015378344757907196, |
|
"median_intvty": 65.02650420070877, |
|
"std_tpot": 0.00029232644196876235, |
|
"std_intvty": 3420.833207099544, |
|
"p99_tpot": 0.016509211088623917, |
|
"p99_intvty": 60.572246283111305, |
|
"mean_itl": 0.015407714691845254, |
|
"median_itl": 0.014949633012292907, |
|
"std_itl": 0.010987243014788517, |
|
"p99_itl": 0.015541010827291757, |
|
"mean_e2el": 14.395175752249342, |
|
"median_e2el": 14.387992981515708, |
|
"std_e2el": 1.024045564918163, |
|
"p99_e2el": 16.45127784504089 |
|
}, |
|
{ |
|
"hw": "mi355x", |
|
"conc": 4, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 53.50162023190284, |
|
"output_tput_per_gpu": 26.734477888062617, |
|
"input_tput_per_gpu": 26.767142343840224, |
|
"mean_ttft": 0.10254724691141746, |
|
"median_ttft": 0.09931363150826655, |
|
"std_ttft": 0.020107927751740414, |
|
"p99_ttft": 0.23979883406020236, |
|
"mean_tpot": 0.01846855797035394, |
|
"mean_intvty": 54.14607906070512, |
|
"median_tpot": 0.018474835274947463, |
|
"median_intvty": 54.12768152558501, |
|
"std_tpot": 7.746174410811525e-05, |
|
"std_intvty": 12909.59829931373, |
|
"p99_tpot": 0.01859121090967565, |
|
"p99_intvty": 53.78885780267049, |
|
"mean_itl": 0.018468433167693556, |
|
"median_itl": 0.01827680302085355, |
|
"std_itl": 0.0028933581315699784, |
|
"p99_itl": 0.01859799599274993, |
|
"mean_e2el": 17.089165465495462, |
|
"median_e2el": 17.158865097997477, |
|
"std_e2el": 1.1077995214102774, |
|
"p99_e2el": 18.995283529475564 |
|
}, |
|
{ |
|
"hw": "b200", |
|
"conc": 64, |
|
"image": "lmsysorg/sglang:v0.5.5-cu129-amd64", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 4, |
|
"ep": 4, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 1096.2822242721327, |
|
"output_tput_per_gpu": 548.0175678059866, |
|
"input_tput_per_gpu": 548.2646564661461, |
|
"mean_ttft": 7.17844623914525, |
|
"median_ttft": 0.4653839475067798, |
|
"std_ttft": 20.386307459943882, |
|
"p99_ttft": 69.42139958190266, |
|
"mean_tpot": 0.020794494405061315, |
|
"mean_intvty": 48.08965202619224, |
|
"median_tpot": 0.020746625585331274, |
|
"median_intvty": 48.20060958284424, |
|
"std_tpot": 0.0028751198593529735, |
|
"std_intvty": 347.81158661852913, |
|
"p99_tpot": 0.021941996993297568, |
|
"p99_intvty": 45.57470317334659, |
|
"mean_itl": 0.2071506984134482, |
|
"median_itl": 0.18764059199020267, |
|
"std_itl": 0.28859335818358, |
|
"p99_itl": 0.5049551923532272, |
|
"mean_e2el": 26.344740840698705, |
|
"median_e2el": 19.711951009492623, |
|
"std_e2el": 20.784469584212715, |
|
"p99_e2el": 90.65216694450035 |
|
}, |
|
{ |
|
"hw": "h200", |
|
"conc": 16, |
|
"image": "lmsysorg/sglang:v0.5.5-cu129-amd64", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 218.9619801747013, |
|
"output_tput_per_gpu": 108.90134419181408, |
|
"input_tput_per_gpu": 110.06063598288722, |
|
"mean_ttft": 0.2250255027145613, |
|
"median_ttft": 0.16274974308907986, |
|
"std_ttft": 0.16713294232046957, |
|
"p99_ttft": 0.7348776398133486, |
|
"mean_tpot": 0.017778192618412375, |
|
"mean_intvty": 56.24868744893269, |
|
"median_tpot": 0.017953584201255614, |
|
"median_intvty": 55.69918456338446, |
|
"std_tpot": 0.000767799496961416, |
|
"std_intvty": 1302.4233591680156, |
|
"p99_tpot": 0.01863887407773945, |
|
"p99_intvty": 53.65130939933263, |
|
"mean_itl": 0.1770341265797833, |
|
"median_itl": 0.15951490867882967, |
|
"std_itl": 0.05625378081156495, |
|
"p99_itl": 0.42379419051110745, |
|
"mean_e2el": 16.499993851641193, |
|
"median_e2el": 16.441055288538337, |
|
"std_e2el": 1.38702436282795, |
|
"p99_e2el": 18.874534588856623 |
|
}, |
|
{ |
|
"hw": "b200-trt", |
|
"conc": 4, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 130.39207323866637, |
|
"output_tput_per_gpu": 64.87879171717749, |
|
"input_tput_per_gpu": 65.51328152148888, |
|
"mean_ttft": 0.10384224493755027, |
|
"median_ttft": 0.08351057092659175, |
|
"std_ttft": 0.059050758293203054, |
|
"p99_ttft": 0.3299985429877415, |
|
"mean_tpot": 0.007367507908830015, |
|
"mean_intvty": 135.73110641679696, |
|
"median_tpot": 0.007388820195880784, |
|
"median_intvty": 135.33960408963438, |
|
"std_tpot": 0.00010658819974353541, |
|
"std_intvty": 9381.90158391009, |
|
"p99_tpot": 0.0075063634329180055, |
|
"p99_intvty": 133.22030153971141, |
|
"mean_itl": 0.07320066111980335, |
|
"median_itl": 0.07186384731903672, |
|
"std_itl": 0.011997052549359141, |
|
"p99_itl": 0.13464706712402402, |
|
"mean_e2el": 6.8584330716519615, |
|
"median_e2el": 6.9107868985738605, |
|
"std_e2el": 0.4856356363968179, |
|
"p99_e2el": 7.7363094548135996 |
|
}, |
|
{ |
|
"hw": "b200-trt", |
|
"conc": 32, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 8, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 524.4126827001851, |
|
"output_tput_per_gpu": 262.6217119264341, |
|
"input_tput_per_gpu": 261.790970773751, |
|
"mean_ttft": 0.28176771416274277, |
|
"median_ttft": 0.10523877049854491, |
|
"std_ttft": 0.5514158412173853, |
|
"p99_ttft": 2.8611615708237514, |
|
"mean_tpot": 0.014580355019460913, |
|
"mean_intvty": 68.58543558543428, |
|
"median_tpot": 0.014660147404230436, |
|
"median_intvty": 68.21213814749454, |
|
"std_tpot": 0.0006148540724014421, |
|
"std_intvty": 1626.4021739244392, |
|
"p99_tpot": 0.015716656868408684, |
|
"p99_intvty": 63.626762890653495, |
|
"mean_itl": 0.14502292385488288, |
|
"median_itl": 0.12457800451375078, |
|
"std_itl": 0.05289910028839434, |
|
"p99_itl": 0.33641460732236733, |
|
"mean_e2el": 13.741707663203396, |
|
"median_e2el": 13.719763337488985, |
|
"std_e2el": 1.291835065639702, |
|
"p99_e2el": 17.680752003692323 |
|
}, |
|
{ |
|
"hw": "mi325x", |
|
"conc": 8, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 83.30705438438346, |
|
"output_tput_per_gpu": 41.80793180539006, |
|
"input_tput_per_gpu": 41.4991225789934, |
|
"mean_ttft": 0.1667088986840099, |
|
"median_ttft": 0.12355327396653593, |
|
"std_ttft": 0.16357438312889708, |
|
"p99_ttft": 0.9656905179191381, |
|
"mean_tpot": 0.02320572207020305, |
|
"mean_intvty": 43.09281982154025, |
|
"median_tpot": 0.023210967987731892, |
|
"median_intvty": 43.08308040097888, |
|
"std_tpot": 0.0002524520409704655, |
|
"std_intvty": 3961.1484072612056, |
|
"p99_tpot": 0.023800257855761127, |
|
"p99_intvty": 42.01635150595389, |
|
"mean_itl": 0.02321012615910169, |
|
"median_itl": 0.02265459089539945, |
|
"std_itl": 0.007624513065071387, |
|
"p99_itl": 0.05260278491768986, |
|
"mean_e2el": 21.70744649678818, |
|
"median_e2el": 21.906510022003204, |
|
"std_e2el": 1.4592142843028952, |
|
"p99_e2el": 24.032168783559463 |
|
}, |
|
{ |
|
"hw": "b200-trt", |
|
"conc": 256, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 4, |
|
"ep": 4, |
|
"dp_attention": "true", |
|
"tput_per_gpu": 3435.18361133009, |
|
"output_tput_per_gpu": 1718.4222304210036, |
|
"input_tput_per_gpu": 1716.7613809090865, |
|
"mean_ttft": 1.1643840026619727, |
|
"median_ttft": 0.8834498869255185, |
|
"std_ttft": 1.1624854838093355, |
|
"p99_ttft": 5.811972907194868, |
|
"mean_tpot": 0.03447258798480657, |
|
"mean_intvty": 29.008556028364904, |
|
"median_tpot": 0.034761480082379614, |
|
"median_intvty": 28.76747473439412, |
|
"std_tpot": 0.0011874833520894386, |
|
"std_intvty": 842.1170690439138, |
|
"p99_tpot": 0.03600821591107884, |
|
"p99_intvty": 27.771439786671703, |
|
"mean_itl": 0.3428813619008422, |
|
"median_itl": 0.30835804296657443, |
|
"std_itl": 0.07041792406889948, |
|
"p99_itl": 0.6002769273146986, |
|
"mean_e2el": 32.929797214319116, |
|
"median_e2el": 32.91153870499693, |
|
"std_e2el": 2.7731514659221888, |
|
"p99_e2el": 39.17268681048881 |
|
}, |
|
{ |
|
"hw": "b200", |
|
"conc": 8, |
|
"image": "lmsysorg/sglang:v0.5.5-cu129-amd64", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 4, |
|
"ep": 4, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 365.5228885167477, |
|
"output_tput_per_gpu": 183.4389189408438, |
|
"input_tput_per_gpu": 182.0839695759039, |
|
"mean_ttft": 0.21978793078742456, |
|
"median_ttft": 0.16096528299385682, |
|
"std_ttft": 0.1827035024344528, |
|
"p99_ttft": 0.7498866373926285, |
|
"mean_tpot": 0.010447272835718634, |
|
"mean_intvty": 95.71875988354172, |
|
"median_tpot": 0.01041591579162534, |
|
"median_intvty": 96.00692056324277, |
|
"std_tpot": 0.00035011992456693975, |
|
"std_intvty": 2856.1642164092523, |
|
"p99_tpot": 0.011324179912974196, |
|
"p99_intvty": 88.30661537391265, |
|
"mean_itl": 0.10398533259301215, |
|
"median_itl": 0.09850755799561739, |
|
"std_itl": 0.03151717884306649, |
|
"p99_itl": 0.1843689631097369, |
|
"mean_e2el": 9.919019663974177, |
|
"median_e2el": 9.89528558299935, |
|
"std_e2el": 0.7666360249072773, |
|
"p99_e2el": 11.565644675162329 |
|
}, |
|
{ |
|
"hw": "b200-trt", |
|
"conc": 128, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 8, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 1125.7129527716302, |
|
"output_tput_per_gpu": 562.2389021650472, |
|
"input_tput_per_gpu": 563.474050606583, |
|
"mean_ttft": 0.7920400589951896, |
|
"median_ttft": 0.1966307605180191, |
|
"std_ttft": 2.0921973819547266, |
|
"p99_ttft": 10.783431073446994, |
|
"mean_tpot": 0.026992523938468523, |
|
"mean_intvty": 37.04729510584393, |
|
"median_tpot": 0.027415059531668028, |
|
"median_intvty": 36.47630233466637, |
|
"std_tpot": 0.0021903253262609518, |
|
"std_intvty": 456.55318322373347, |
|
"p99_tpot": 0.031001554485203708, |
|
"p99_intvty": 32.25644702678621, |
|
"mean_itl": 0.2686317336462882, |
|
"median_itl": 0.1916247960034525, |
|
"std_itl": 0.12590846221249627, |
|
"p99_itl": 0.7419374693455757, |
|
"mean_e2el": 25.625784449898287, |
|
"median_e2el": 25.4115537799953, |
|
"std_e2el": 3.5966496141912545, |
|
"p99_e2el": 37.37643152206612 |
|
}, |
|
{ |
|
"hw": "mi355x", |
|
"conc": 64, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", |
|
"model": "amd/DeepSeek-R1-0528-MXFP4-Preview", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 4, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 817.5906995985575, |
|
"output_tput_per_gpu": 408.9344372523261, |
|
"input_tput_per_gpu": 408.6562623462314, |
|
"mean_ttft": 0.26780416145786146, |
|
"median_ttft": 0.14544924450456165, |
|
"std_ttft": 0.6281365506682745, |
|
"p99_ttft": 4.384220488306019, |
|
"mean_tpot": 0.03861481596860886, |
|
"mean_intvty": 25.89679569657745, |
|
"median_tpot": 0.038087981672516535, |
|
"median_intvty": 26.255001081393043, |
|
"std_tpot": 0.0021292654234834345, |
|
"std_intvty": 469.64553548426136, |
|
"p99_tpot": 0.046742885398299544, |
|
"p99_intvty": 21.393630099616807, |
|
"mean_itl": 0.038622826603984264, |
|
"median_itl": 0.03207420802209526, |
|
"std_itl": 0.027712658850288827, |
|
"p99_itl": 0.1306680386900555, |
|
"mean_e2el": 35.83653053449152, |
|
"median_e2el": 35.64918338600546, |
|
"std_e2el": 3.283118759795647, |
|
"p99_e2el": 46.87365877556817 |
|
}, |
|
{ |
|
"hw": "b200-trt", |
|
"conc": 64, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 8, |
|
"dp_attention": "true", |
|
"tput_per_gpu": 515.684617321858, |
|
"output_tput_per_gpu": 257.78419414518766, |
|
"input_tput_per_gpu": 257.90042317667036, |
|
"mean_ttft": 1.6282350063396733, |
|
"median_ttft": 1.4507918141316622, |
|
"std_ttft": 1.0682361391017958, |
|
"p99_ttft": 4.793210491444916, |
|
"mean_tpot": 0.027976202421423374, |
|
"mean_intvty": 35.74466558885879, |
|
"median_tpot": 0.02801925929453371, |
|
"median_intvty": 35.689737172855615, |
|
"std_tpot": 0.00040506612311662317, |
|
"std_intvty": 2468.732739992894, |
|
"p99_tpot": 0.02864856500094191, |
|
"p99_intvty": 34.90576229445077, |
|
"mean_itl": 0.27817329930642665, |
|
"median_itl": 0.2683784537948668, |
|
"std_itl": 0.038889263577688474, |
|
"p99_itl": 0.4092893096618354, |
|
"mean_e2el": 27.392732787023853, |
|
"median_e2el": 27.443264302331954, |
|
"std_e2el": 1.9662657146460998, |
|
"p99_e2el": 32.067206466505304 |
|
}, |
|
{ |
|
"hw": "mi355x", |
|
"conc": 64, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 506.58988065776214, |
|
"output_tput_per_gpu": 253.38112073219884, |
|
"input_tput_per_gpu": 253.2087599255633, |
|
"mean_ttft": 0.15677339283993205, |
|
"median_ttft": 0.11557232451741584, |
|
"std_ttft": 0.24376142183145486, |
|
"p99_ttft": 1.55091424578859, |
|
"mean_tpot": 0.031155865704754086, |
|
"mean_intvty": 32.09668476159241, |
|
"median_tpot": 0.03126428169717604, |
|
"median_intvty": 31.985382222625173, |
|
"std_tpot": 0.0006454958370875492, |
|
"std_intvty": 1549.196667962971, |
|
"p99_tpot": 0.03177064982995281, |
|
"p99_intvty": 31.475591634176066, |
|
"mean_itl": 0.03116330467903974, |
|
"median_itl": 0.027022671012673527, |
|
"std_itl": 0.013900402869031152, |
|
"p99_itl": 0.08222846371354539, |
|
"mean_e2el": 28.854641751212547, |
|
"median_e2el": 28.862312199518783, |
|
"std_e2el": 1.987083004970599, |
|
"p99_e2el": 32.30646628647577 |
|
}, |
|
{ |
|
"hw": "b200", |
|
"conc": 16, |
|
"image": "lmsysorg/sglang:v0.5.5-cu129-amd64", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 8, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 345.121590316537, |
|
"output_tput_per_gpu": 171.64717392992364, |
|
"input_tput_per_gpu": 173.47441638661334, |
|
"mean_ttft": 0.4287117916886928, |
|
"median_ttft": 0.29100034397561103, |
|
"std_ttft": 0.3754468470905531, |
|
"p99_ttft": 1.449857907204423, |
|
"mean_tpot": 0.010927759724302835, |
|
"mean_intvty": 91.51006475517997, |
|
"median_tpot": 0.010944936333709682, |
|
"median_intvty": 91.36645198383347, |
|
"std_tpot": 0.0004429086785558657, |
|
"std_intvty": 2257.801773631009, |
|
"p99_tpot": 0.011884931081937746, |
|
"p99_intvty": 84.14015976245423, |
|
"mean_itl": 0.10880163688398721, |
|
"median_itl": 0.09833925892598927, |
|
"std_itl": 0.04595109893382239, |
|
"p99_itl": 0.20008037610910834, |
|
"mean_e2el": 10.430982121020497, |
|
"median_e2el": 10.334178404998966, |
|
"std_e2el": 0.8883035576179695, |
|
"p99_e2el": 12.850713363841642 |
|
}, |
|
{ |
|
"hw": "b200", |
|
"conc": 16, |
|
"image": "lmsysorg/sglang:v0.5.5-cu129-amd64", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 4, |
|
"ep": 4, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 591.6975315742928, |
|
"output_tput_per_gpu": 294.28239775694084, |
|
"input_tput_per_gpu": 297.415133817352, |
|
"mean_ttft": 0.4090025126695764, |
|
"median_ttft": 0.32702109598903917, |
|
"std_ttft": 0.2800616933864141, |
|
"p99_ttft": 1.1169804487048531, |
|
"mean_tpot": 0.012855804366015569, |
|
"mean_intvty": 77.78587566589833, |
|
"median_tpot": 0.01288727161423733, |
|
"median_intvty": 77.59594349631314, |
|
"std_tpot": 0.0004046021148751569, |
|
"std_intvty": 2471.563946986678, |
|
"p99_tpot": 0.013792283890053094, |
|
"p99_intvty": 72.50430805888455, |
|
"mean_itl": 0.12799021825366264, |
|
"median_itl": 0.11939197301398963, |
|
"std_itl": 0.04085137925509727, |
|
"p99_itl": 0.2096828156861011, |
|
"mean_e2el": 12.175303138899835, |
|
"median_e2el": 12.111572517009336, |
|
"std_e2el": 0.9588628085163959, |
|
"p99_e2el": 14.376447648540488 |
|
}, |
|
{ |
|
"hw": "b200", |
|
"conc": 32, |
|
"image": "lmsysorg/sglang:v0.5.5-cu129-amd64", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 8, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 376.08593596306514, |
|
"output_tput_per_gpu": 188.34085366799343, |
|
"input_tput_per_gpu": 187.7450822950717, |
|
"mean_ttft": 6.998798271215582, |
|
"median_ttft": 0.33056944848794956, |
|
"std_ttft": 20.38128105719959, |
|
"p99_ttft": 69.39045825751091, |
|
"mean_tpot": 0.013326483664183916, |
|
"mean_intvty": 75.03854919265665, |
|
"median_tpot": 0.013125619920767528, |
|
"median_intvty": 76.18687772741208, |
|
"std_tpot": 0.004215008131052788, |
|
"std_intvty": 237.24746641241447, |
|
"p99_tpot": 0.013970387824769476, |
|
"p99_intvty": 71.57997419563411, |
|
"mean_itl": 0.13259072985880746, |
|
"median_itl": 0.1174843479966512, |
|
"std_itl": 0.39869151662864727, |
|
"p99_itl": 0.26701540267007656, |
|
"mean_e2el": 19.297417002759357, |
|
"median_e2el": 12.587621595492237, |
|
"std_e2el": 20.635825312768564, |
|
"p99_e2el": 82.8554250155951 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 2252, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "mtp", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "true", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 8, |
|
"decode_ep": 8, |
|
"decode_dp_attention": "true", |
|
"decode_num_workers": 1, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 8, |
|
"tput_per_gpu": 9565.964844444936, |
|
"output_tput_per_gpu": 7176.685588522945, |
|
"input_tput_per_gpu": 14344.523356288919, |
|
"mean_ttft": 5.374154983334451, |
|
"median_ttft": 3.135583871626295, |
|
"std_ttft": 6.370289718143169, |
|
"p99_ttft": 32.83654104531736, |
|
"mean_tpot": 0.03158022985201884, |
|
"mean_intvty": 31.6653806728412, |
|
"median_tpot": 0.0320205555867729, |
|
"median_intvty": 31.22993907117219, |
|
"std_tpot": 0.002056902606019846, |
|
"std_intvty": 486.1678900465896, |
|
"p99_tpot": 0.0344045670103347, |
|
"p99_intvty": 29.0659085957865, |
|
"mean_itl": 1.093646306347774, |
|
"median_itl": 1.1256944275228307, |
|
"std_itl": 0.17994813718568728, |
|
"p99_itl": 1.5281442321743817, |
|
"mean_e2el": 34.46553513161057, |
|
"median_e2el": 32.975652318564244, |
|
"std_e2el": 6.723949130269238, |
|
"p99_e2el": 62.323754684999585 |
|
}, |
|
{ |
|
"hw": "h200", |
|
"conc": 8, |
|
"image": "lmsysorg/sglang:v0.5.5-cu129-amd64", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 144.44045336262005, |
|
"output_tput_per_gpu": 72.48793837147186, |
|
"input_tput_per_gpu": 71.95251499114819, |
|
"mean_ttft": 0.15794080712657887, |
|
"median_ttft": 0.11560489097610116, |
|
"std_ttft": 0.11723623617261816, |
|
"p99_ttft": 0.5249040435464122, |
|
"mean_tpot": 0.013352168637406769, |
|
"mean_intvty": 74.89420087149362, |
|
"median_tpot": 0.013362889827442888, |
|
"median_intvty": 74.83411244971397, |
|
"std_tpot": 0.00042764836222422857, |
|
"std_intvty": 2338.3697643524956, |
|
"p99_tpot": 0.014155767784631543, |
|
"p99_intvty": 70.6425829537602, |
|
"mean_itl": 0.13288864953492957, |
|
"median_itl": 0.12755411153193563, |
|
"std_itl": 0.026600275860857486, |
|
"p99_itl": 0.22374155048746616, |
|
"mean_e2el": 12.55312944536272, |
|
"median_e2el": 12.541978692403063, |
|
"std_e2el": 0.9164543979314959, |
|
"p99_e2el": 14.549708433914928 |
|
}, |
|
{ |
|
"hw": "mi325x", |
|
"conc": 32, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 265.85040830932485, |
|
"output_tput_per_gpu": 133.13577579215888, |
|
"input_tput_per_gpu": 132.71463251716597, |
|
"mean_ttft": 0.3410089746554149, |
|
"median_ttft": 0.1312808699440211, |
|
"std_ttft": 0.6103937126254345, |
|
"p99_ttft": 2.2063028247514738, |
|
"mean_tpot": 0.0288512910442555, |
|
"mean_intvty": 34.6604939954362, |
|
"median_tpot": 0.029053521512887044, |
|
"median_intvty": 34.41923553247884, |
|
"std_tpot": 0.0008179312454917885, |
|
"std_intvty": 1222.5966491825861, |
|
"p99_tpot": 0.029924656885910495, |
|
"p99_intvty": 33.41725867777059, |
|
"mean_itl": 0.028863966021323222, |
|
"median_itl": 0.026629538740962744, |
|
"std_itl": 0.01149186789899497, |
|
"p99_itl": 0.0748907681554556, |
|
"mean_e2el": 26.973519575050158, |
|
"median_e2el": 27.014708928065374, |
|
"std_e2el": 2.015044186051719, |
|
"p99_e2el": 31.37833488534205 |
|
}, |
|
{ |
|
"hw": "h200", |
|
"conc": 16, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 8, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 188.60095504951445, |
|
"output_tput_per_gpu": 93.801204685694, |
|
"input_tput_per_gpu": 94.79975036382045, |
|
"mean_ttft": 0.6688403705309611, |
|
"median_ttft": 0.1129344068467617, |
|
"std_ttft": 1.7241678215113387, |
|
"p99_ttft": 7.4615381137467915, |
|
"mean_tpot": 0.02018827097949587, |
|
"mean_intvty": 49.53371197640677, |
|
"median_tpot": 0.02006386798346753, |
|
"median_intvty": 49.84083830814638, |
|
"std_tpot": 0.0010003112701330579, |
|
"std_intvty": 999.6888267258886, |
|
"p99_tpot": 0.023235139422495135, |
|
"p99_intvty": 43.038261222217955, |
|
"mean_itl": 0.2007144330078277, |
|
"median_itl": 0.19081574399024248, |
|
"std_itl": 0.09424814280618914, |
|
"p99_itl": 0.3050570430606604, |
|
"mean_e2el": 19.140840336726978, |
|
"median_e2el": 18.724031593184918, |
|
"std_e2el": 2.415333818167487, |
|
"p99_e2el": 26.955082109412178 |
|
}, |
|
{ |
|
"hw": "mi300x", |
|
"conc": 16, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 145.9407901044831, |
|
"output_tput_per_gpu": 72.58405409977149, |
|
"input_tput_per_gpu": 73.35673600471162, |
|
"mean_ttft": 0.28639606877113694, |
|
"median_ttft": 0.15204137470573187, |
|
"std_ttft": 0.3995107737380535, |
|
"p99_ttft": 1.528022503554821, |
|
"mean_tpot": 0.026627124813234943, |
|
"mean_intvty": 37.555688307096254, |
|
"median_tpot": 0.02677065003850398, |
|
"median_intvty": 37.354341361218694, |
|
"std_tpot": 0.0005422849115978797, |
|
"std_intvty": 1844.0490941439461, |
|
"p99_tpot": 0.02732710420093871, |
|
"p99_intvty": 36.59370537935187, |
|
"mean_itl": 0.026633314140908092, |
|
"median_itl": 0.02516160160303116, |
|
"std_itl": 0.012500358947689737, |
|
"p99_itl": 0.12531440477818248, |
|
"mean_e2el": 24.65304853579728, |
|
"median_e2el": 24.512640615925193, |
|
"std_e2el": 1.7986590725432812, |
|
"p99_e2el": 28.2529873347003 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 64, |
|
"image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", |
|
"model": "deepseek-ai/DeepSeek-R1", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 1, |
|
"prefill_ep": 1, |
|
"prefill_dp_attention": "true", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 1, |
|
"decode_ep": 1, |
|
"decode_dp_attention": "true", |
|
"decode_num_workers": 4, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 16, |
|
"tput_per_gpu": 369.2547065443617, |
|
"output_tput_per_gpu": 231.14978519935755, |
|
"input_tput_per_gpu": 921.6743919243781, |
|
"mean_ttft": 0.8270642481969844, |
|
"median_ttft": 0.4514468389097601, |
|
"std_ttft": 0.9972101875230466, |
|
"p99_ttft": 4.402157481594477, |
|
"mean_tpot": 0.01551686899186248, |
|
"mean_intvty": 64.44599103881269, |
|
"median_tpot": 0.015370686094954806, |
|
"median_intvty": 65.0589045812493, |
|
"std_tpot": 0.0006835531952533813, |
|
"std_intvty": 1462.9439331774572, |
|
"p99_tpot": 0.01698190977734623, |
|
"p99_intvty": 58.88619201910932, |
|
"mean_itl": 0.15433621767522684, |
|
"median_itl": 0.1543394629843533, |
|
"std_itl": 0.01465307230234735, |
|
"p99_itl": 0.18508471085689962, |
|
"mean_e2el": 15.14271294083519, |
|
"median_e2el": 15.005144527531229, |
|
"std_e2el": 1.4340534060719854, |
|
"p99_e2el": 19.146446354161018 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 16, |
|
"image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", |
|
"model": "deepseek-ai/DeepSeek-R1", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 1, |
|
"prefill_ep": 1, |
|
"prefill_dp_attention": "true", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 1, |
|
"decode_ep": 1, |
|
"decode_dp_attention": "true", |
|
"decode_num_workers": 4, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 16, |
|
"tput_per_gpu": 142.42458138976733, |
|
"output_tput_per_gpu": 89.34533260953395, |
|
"input_tput_per_gpu": 354.74157651070084, |
|
"mean_ttft": 0.43835763942915945, |
|
"median_ttft": 0.3568364870734513, |
|
"std_ttft": 0.26540060765389173, |
|
"p99_ttft": 1.187165441869292, |
|
"mean_tpot": 0.010141933251875089, |
|
"mean_intvty": 98.60053060545583, |
|
"median_tpot": 0.01014871645207556, |
|
"median_intvty": 98.53462797213982, |
|
"std_tpot": 0.00013018611906185802, |
|
"std_intvty": 7681.310474620182, |
|
"p99_tpot": 0.01047412996031159, |
|
"p99_intvty": 95.47332368313019, |
|
"mean_itl": 0.10093194153805742, |
|
"median_itl": 0.10092792799696326, |
|
"std_itl": 0.006848242090987182, |
|
"p99_itl": 0.12125953753711656, |
|
"mean_e2el": 9.852784376783529, |
|
"median_e2el": 9.80505621805787, |
|
"std_e2el": 0.7130732098334741, |
|
"p99_e2el": 11.30290693300776 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 8, |
|
"image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", |
|
"model": "deepseek-ai/DeepSeek-R1", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 1, |
|
"prefill_ep": 1, |
|
"prefill_dp_attention": "true", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 1, |
|
"decode_ep": 1, |
|
"decode_dp_attention": "true", |
|
"decode_num_workers": 4, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 16, |
|
"tput_per_gpu": 73.46534526379665, |
|
"output_tput_per_gpu": 45.6924136127911, |
|
"input_tput_per_gpu": 184.55707186781882, |
|
"mean_ttft": 0.39117412944906393, |
|
"median_ttft": 0.3618117090081796, |
|
"std_ttft": 0.14675603200311926, |
|
"p99_ttft": 0.6161122514656745, |
|
"mean_tpot": 0.010038821876919363, |
|
"mean_intvty": 99.61328254056762, |
|
"median_tpot": 0.010071348848853455, |
|
"median_intvty": 99.2915661057498, |
|
"std_tpot": 0.00013552325700286308, |
|
"std_intvty": 7378.8073140750585, |
|
"p99_tpot": 0.010216100507633495, |
|
"p99_intvty": 97.88470652308065, |
|
"mean_itl": 0.09994822494875504, |
|
"median_itl": 0.1004420870449394, |
|
"std_itl": 0.00584334918747307, |
|
"p99_itl": 0.10443910209229217, |
|
"mean_e2el": 9.596405503235292, |
|
"median_e2el": 9.610710931476206, |
|
"std_e2el": 0.6846882197889491, |
|
"p99_e2el": 10.864912356187125 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 2, |
|
"image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", |
|
"model": "deepseek-ai/DeepSeek-R1", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 1, |
|
"prefill_ep": 1, |
|
"prefill_dp_attention": "true", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 1, |
|
"decode_ep": 1, |
|
"decode_dp_attention": "true", |
|
"decode_num_workers": 4, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 16, |
|
"tput_per_gpu": 19.16741824702025, |
|
"output_tput_per_gpu": 12.022041042316383, |
|
"input_tput_per_gpu": 47.74892706583572, |
|
"mean_ttft": 0.2508541756309569, |
|
"median_ttft": 0.2253928908612579, |
|
"std_ttft": 0.05659605374956713, |
|
"p99_ttft": 0.37547269876580686, |
|
"mean_tpot": 0.009992066656203408, |
|
"mean_intvty": 100.07939642588018, |
|
"median_tpot": 0.009923142353868803, |
|
"median_intvty": 100.77452931128447, |
|
"std_tpot": 0.00015203545920880457, |
|
"std_intvty": 6577.41296145004, |
|
"p99_tpot": 0.010203375200618228, |
|
"p99_intvty": 98.00678504299337, |
|
"mean_itl": 0.0994283992160313, |
|
"median_itl": 0.0991393809672445, |
|
"std_itl": 0.005487413834778653, |
|
"p99_itl": 0.10233251522295177, |
|
"mean_e2el": 9.447980901552365, |
|
"median_e2el": 9.347289866884239, |
|
"std_e2el": 0.5026390952394821, |
|
"p99_e2el": 10.416231317531784 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 128, |
|
"image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", |
|
"model": "deepseek-ai/DeepSeek-R1", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 1, |
|
"prefill_ep": 1, |
|
"prefill_dp_attention": "true", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 1, |
|
"decode_ep": 1, |
|
"decode_dp_attention": "true", |
|
"decode_num_workers": 4, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 16, |
|
"tput_per_gpu": 549.2274768221029, |
|
"output_tput_per_gpu": 343.18980477415954, |
|
"input_tput_per_gpu": 1373.378165013876, |
|
"mean_ttft": 1.5116792790606268, |
|
"median_ttft": 0.5904859320726246, |
|
"std_ttft": 2.299484187479841, |
|
"p99_ttft": 9.883723249626348, |
|
"mean_tpot": 0.02010225303510214, |
|
"mean_intvty": 49.74566772459886, |
|
"median_tpot": 0.020274090305293722, |
|
"median_intvty": 49.32403796874143, |
|
"std_tpot": 0.0009383465250459292, |
|
"std_intvty": 1065.704378189127, |
|
"p99_tpot": 0.021193507613449555, |
|
"p99_intvty": 47.18426124826042, |
|
"mean_itl": 0.20003320885117806, |
|
"median_itl": 0.202942225150764, |
|
"std_itl": 0.01937618510530571, |
|
"p99_itl": 0.22431977200787515, |
|
"mean_e2el": 20.019439281575615, |
|
"median_e2el": 19.382260965532623, |
|
"std_e2el": 2.8887589999896885, |
|
"p99_e2el": 29.421967148412953 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 4, |
|
"image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", |
|
"model": "deepseek-ai/DeepSeek-R1", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 1, |
|
"prefill_ep": 1, |
|
"prefill_dp_attention": "true", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 1, |
|
"decode_ep": 1, |
|
"decode_dp_attention": "true", |
|
"decode_num_workers": 4, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 16, |
|
"tput_per_gpu": 37.07832552156601, |
|
"output_tput_per_gpu": 23.38776299236849, |
|
"input_tput_per_gpu": 91.84057563835607, |
|
"mean_ttft": 0.2848202320281416, |
|
"median_ttft": 0.269202574971132, |
|
"std_ttft": 0.07170948014447356, |
|
"p99_ttft": 0.44365355880232527, |
|
"mean_tpot": 0.009954524750513092, |
|
"mean_intvty": 100.45682994042045, |
|
"median_tpot": 0.009906810890410695, |
|
"median_intvty": 100.94065699466927, |
|
"std_tpot": 0.0001285214031148149, |
|
"std_intvty": 7780.805187029026, |
|
"p99_tpot": 0.010195688043672196, |
|
"p99_intvty": 98.0806783923362, |
|
"mean_itl": 0.0991256110620117, |
|
"median_itl": 0.09905814949888736, |
|
"std_itl": 0.00498408178976311, |
|
"p99_itl": 0.10972817806759849, |
|
"mean_e2el": 9.582802380016073, |
|
"median_e2el": 9.586307910620235, |
|
"std_e2el": 0.5309077801318303, |
|
"p99_e2el": 10.413642256180756 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 2150, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "mtp", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "true", |
|
"prefill_num_workers": 2, |
|
"decode_tp": 16, |
|
"decode_ep": 16, |
|
"decode_dp_attention": "true", |
|
"decode_num_workers": 1, |
|
"num_prefill_gpu": 8, |
|
"num_decode_gpu": 16, |
|
"tput_per_gpu": 7779.661522604285, |
|
"output_tput_per_gpu": 5837.002387451842, |
|
"input_tput_per_gpu": 11664.979792909171, |
|
"mean_ttft": 2.2882813471733714, |
|
"median_ttft": 1.019085165928118, |
|
"std_ttft": 3.750244053695363, |
|
"p99_ttft": 17.69374180560931, |
|
"mean_tpot": 0.01936886569157753, |
|
"mean_intvty": 51.62924953498159, |
|
"median_tpot": 0.019454735662928227, |
|
"median_intvty": 51.40136660430395, |
|
"std_tpot": 0.001122439813793404, |
|
"std_intvty": 890.9163660369409, |
|
"p99_tpot": 0.021635754025400737, |
|
"p99_intvty": 46.21978965124041, |
|
"mean_itl": 0.6729626038479152, |
|
"median_itl": 0.6757575224619359, |
|
"std_itl": 0.20694689671667543, |
|
"p99_itl": 1.3169609571690672, |
|
"mean_e2el": 20.13102391845835, |
|
"median_e2el": 19.213375322520733, |
|
"std_e2el": 3.8795842324646017, |
|
"p99_e2el": 35.45968775685409 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 4096, |
|
"image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", |
|
"model": "deepseek-ai/DeepSeek-R1", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 1, |
|
"prefill_ep": 1, |
|
"prefill_dp_attention": "true", |
|
"prefill_num_workers": 2, |
|
"decode_tp": 1, |
|
"decode_ep": 1, |
|
"decode_dp_attention": "true", |
|
"decode_num_workers": 1, |
|
"num_prefill_gpu": 16, |
|
"num_decode_gpu": 32, |
|
"tput_per_gpu": 4244.133401423087, |
|
"output_tput_per_gpu": 3183.7980300916183, |
|
"input_tput_per_gpu": 6364.804144086024, |
|
"mean_ttft": 3.9048184817184164, |
|
"median_ttft": 1.0373276659520343, |
|
"std_ttft": 6.300515198226906, |
|
"p99_ttft": 23.668318262118845, |
|
"mean_tpot": 0.033368430059067054, |
|
"mean_intvty": 29.968446169923254, |
|
"median_tpot": 0.03439645704521005, |
|
"median_intvty": 29.072761729082128, |
|
"std_tpot": 0.0023285690282117276, |
|
"std_intvty": 429.44829544863035, |
|
"p99_tpot": 0.03564511262533135, |
|
"p99_intvty": 28.05433694405964, |
|
"mean_itl": 1.6253092467528782, |
|
"median_itl": 1.699446365935728, |
|
"std_itl": 0.27450068649372333, |
|
"p99_itl": 1.9895442705275492, |
|
"mean_e2el": 34.64720946440609, |
|
"median_e2el": 33.17021796293557, |
|
"std_e2el": 6.438682169857855, |
|
"p99_e2el": 56.18803137874463 |
|
}, |
|
{ |
|
"hw": "h200", |
|
"conc": 32, |
|
"image": "lmsysorg/sglang:v0.5.5-cu129-amd64", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 300.1205120578451, |
|
"output_tput_per_gpu": 150.29797192363262, |
|
"input_tput_per_gpu": 149.82254013421246, |
|
"mean_ttft": 0.29685301923309454, |
|
"median_ttft": 0.17625455418601632, |
|
"std_ttft": 0.2927745539982455, |
|
"p99_ttft": 1.1773992246761917, |
|
"mean_tpot": 0.02580257027532431, |
|
"mean_intvty": 38.75582894764274, |
|
"median_tpot": 0.026118693609020506, |
|
"median_intvty": 38.28675411447968, |
|
"std_tpot": 0.0014320337154244552, |
|
"std_intvty": 698.307581189595, |
|
"p99_tpot": 0.027484693445275318, |
|
"p99_intvty": 36.383887707938115, |
|
"mean_itl": 0.25686865015909, |
|
"median_itl": 0.22214747732505202, |
|
"std_itl": 0.08631182474967977, |
|
"p99_itl": 0.6101080379541953, |
|
"mean_e2el": 24.123025546537246, |
|
"median_e2el": 24.249714117031544, |
|
"std_e2el": 2.1815695777601647, |
|
"p99_e2el": 27.914561686739326 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 4300, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "true", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 8, |
|
"decode_ep": 8, |
|
"decode_dp_attention": "true", |
|
"decode_num_workers": 1, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 8, |
|
"tput_per_gpu": 10174.958034288868, |
|
"output_tput_per_gpu": 7628.433424529941, |
|
"input_tput_per_gpu": 15268.007253806723, |
|
"mean_ttft": 13.670401084246047, |
|
"median_ttft": 9.438384462031536, |
|
"std_ttft": 10.635169828625827, |
|
"p99_ttft": 60.16970292568674, |
|
"mean_tpot": 0.05262665971987734, |
|
"mean_intvty": 19.001776007119357, |
|
"median_tpot": 0.05425011406303042, |
|
"median_intvty": 18.433140967006103, |
|
"std_tpot": 0.004045703804077536, |
|
"std_intvty": 247.1757816259638, |
|
"p99_tpot": 0.05513972115032886, |
|
"p99_intvty": 18.135746411804913, |
|
"mean_itl": 1.0405354507394053, |
|
"median_itl": 1.0769842360168695, |
|
"std_itl": 0.27911524862124326, |
|
"p99_itl": 1.8041910778265446, |
|
"mean_e2el": 62.106449018242756, |
|
"median_e2el": 60.163522047922015, |
|
"std_e2el": 10.654577982872771, |
|
"p99_e2el": 106.53037952416811 |
|
}, |
|
{ |
|
"hw": "b200-trt", |
|
"conc": 8, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 4, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 378.2064753362504, |
|
"output_tput_per_gpu": 189.8042206156671, |
|
"input_tput_per_gpu": 188.40225472058327, |
|
"mean_ttft": 0.12040181559859775, |
|
"median_ttft": 0.08387166541069746, |
|
"std_ttft": 0.1144210309287736, |
|
"p99_ttft": 0.6233258621720598, |
|
"mean_tpot": 0.01020989147846824, |
|
"mean_intvty": 97.94423399199803, |
|
"median_tpot": 0.010219971794660134, |
|
"median_intvty": 97.84762816297528, |
|
"std_tpot": 0.00023104219384646738, |
|
"std_intvty": 4328.213748976614, |
|
"p99_tpot": 0.01052341271480117, |
|
"p99_intvty": 95.02620747673431, |
|
"mean_itl": 0.10152367619137469, |
|
"median_itl": 0.0983569361269474, |
|
"std_itl": 0.017454745895662968, |
|
"p99_itl": 0.16021439103409646, |
|
"mean_e2el": 9.598905873141485, |
|
"median_e2el": 9.645646893884987, |
|
"std_e2el": 0.6771438655465893, |
|
"p99_e2el": 10.754188930792735 |
|
}, |
|
{ |
|
"hw": "b200-trt", |
|
"conc": 128, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 4, |
|
"ep": 4, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 2114.1922786646155, |
|
"output_tput_per_gpu": 1055.9362782453095, |
|
"input_tput_per_gpu": 1058.256000419306, |
|
"mean_ttft": 0.6532843066768692, |
|
"median_ttft": 0.16898740082979202, |
|
"std_ttft": 1.7107521341864345, |
|
"p99_ttft": 8.835083985212261, |
|
"mean_tpot": 0.028867022477270004, |
|
"mean_intvty": 34.64160534005208, |
|
"median_tpot": 0.029129463913691325, |
|
"median_intvty": 34.329502354143344, |
|
"std_tpot": 0.0016714122095771253, |
|
"std_intvty": 598.29645509949, |
|
"p99_tpot": 0.03221425186577414, |
|
"p99_intvty": 31.042161220029595, |
|
"mean_itl": 0.28714681447905144, |
|
"median_itl": 0.24296033149585128, |
|
"std_itl": 0.08444705281238274, |
|
"p99_itl": 0.6248380776215343, |
|
"mean_e2el": 27.198661110355896, |
|
"median_e2el": 26.954308703076094, |
|
"std_e2el": 3.157631187470903, |
|
"p99_e2el": 37.42864547606092 |
|
}, |
|
{ |
|
"hw": "h200", |
|
"conc": 8, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 8, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 117.32871414110747, |
|
"output_tput_per_gpu": 58.88181878322487, |
|
"input_tput_per_gpu": 58.446895357882596, |
|
"mean_ttft": 0.29144007361028346, |
|
"median_ttft": 0.10553608369082212, |
|
"std_ttft": 0.5253226253779885, |
|
"p99_ttft": 2.00246681207791, |
|
"mean_tpot": 0.016347644074159503, |
|
"mean_intvty": 61.17089382810128, |
|
"median_tpot": 0.016349283292947513, |
|
"median_intvty": 61.16476068595397, |
|
"std_tpot": 0.0004821108343655462, |
|
"std_intvty": 2074.21183827157, |
|
"p99_tpot": 0.017890705160934297, |
|
"p99_intvty": 55.89494606303028, |
|
"mean_itl": 0.1625246226062859, |
|
"median_itl": 0.15966126322746277, |
|
"std_itl": 0.04237352606898844, |
|
"p99_itl": 0.22143338106572627, |
|
"mean_e2el": 15.465144980861805, |
|
"median_e2el": 15.544434817507863, |
|
"std_e2el": 1.2262121790632936, |
|
"p99_e2el": 17.878225984070447 |
|
}, |
|
{ |
|
"hw": "b200", |
|
"conc": 64, |
|
"image": "lmsysorg/sglang:v0.5.5-cu129-amd64", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 601.2893045180805, |
|
"output_tput_per_gpu": 300.7469428036258, |
|
"input_tput_per_gpu": 300.54236171445467, |
|
"mean_ttft": 1.3847566446684505, |
|
"median_ttft": 0.48940439452417195, |
|
"std_ttft": 7.9552599064731675, |
|
"p99_ttft": 74.83421286032069, |
|
"mean_tpot": 0.024963507826510224, |
|
"mean_intvty": 40.05847282960934, |
|
"median_tpot": 0.024399550985048934, |
|
"median_intvty": 40.98436076191566, |
|
"std_tpot": 0.007326251005332097, |
|
"std_intvty": 136.4954598569163, |
|
"p99_tpot": 0.02635633873325382, |
|
"p99_intvty": 37.9415369532453, |
|
"mean_itl": 0.7371267286918883, |
|
"median_itl": 0.7205090830102563, |
|
"std_itl": 1.1897168318420006, |
|
"p99_itl": 0.9564615071751177, |
|
"mean_e2el": 24.362148402571258, |
|
"median_e2el": 22.92119104496669, |
|
"std_e2el": 10.507153438890304, |
|
"p99_e2el": 97.91282886922126 |
|
}, |
|
{ |
|
"hw": "h200", |
|
"conc": 4, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 8, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 69.00918094284734, |
|
"output_tput_per_gpu": 34.33669061131495, |
|
"input_tput_per_gpu": 34.67249033153239, |
|
"mean_ttft": 0.31327792736701665, |
|
"median_ttft": 0.12323521776124835, |
|
"std_ttft": 0.5209406402374375, |
|
"p99_ttft": 1.9304639371391386, |
|
"mean_tpot": 0.013820446557735313, |
|
"mean_intvty": 72.35656212861656, |
|
"median_tpot": 0.013833377276012283, |
|
"median_intvty": 72.28892699500405, |
|
"std_tpot": 0.00028118418017168386, |
|
"std_intvty": 3556.38784297689, |
|
"p99_tpot": 0.014359761685012086, |
|
"p99_intvty": 69.63903872052028, |
|
"mean_itl": 0.13732033936355784, |
|
"median_itl": 0.13542452454566956, |
|
"std_itl": 0.02687858623785908, |
|
"p99_itl": 0.22157836593687535, |
|
"mean_e2el": 12.984511947724968, |
|
"median_e2el": 13.041614898014814, |
|
"std_e2el": 1.0617976522561416, |
|
"p99_e2el": 15.736130541441963 |
|
}, |
|
{ |
|
"hw": "mi355x", |
|
"conc": 8, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 98.6450892768569, |
|
"output_tput_per_gpu": 49.45462028218766, |
|
"input_tput_per_gpu": 49.19046899466923, |
|
"mean_ttft": 0.10774247066983662, |
|
"median_ttft": 0.10164526349399239, |
|
"std_ttft": 0.033814203115657374, |
|
"p99_ttft": 0.2408309916345737, |
|
"mean_tpot": 0.0199555036377042, |
|
"mean_intvty": 50.11148894837143, |
|
"median_tpot": 0.019966499546151303, |
|
"median_intvty": 50.08389165504765, |
|
"std_tpot": 0.00017415392062817336, |
|
"std_intvty": 5742.0470144628325, |
|
"p99_tpot": 0.020283994796128715, |
|
"p99_intvty": 49.29995348800101, |
|
"mean_itl": 0.019956584902269463, |
|
"median_itl": 0.019498449997627176, |
|
"std_itl": 0.004403332083795464, |
|
"p99_itl": 0.03609780419385057, |
|
"mean_e2el": 18.573071370802428, |
|
"median_e2el": 18.57070857749204, |
|
"std_e2el": 1.14884731113673, |
|
"p99_e2el": 20.572804990474832 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 1075, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "true", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 32, |
|
"decode_ep": 32, |
|
"decode_dp_attention": "true", |
|
"decode_num_workers": 1, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 32, |
|
"tput_per_gpu": 2792.6246801122975, |
|
"output_tput_per_gpu": 1571.108202945853, |
|
"input_tput_per_gpu": 12564.756497443854, |
|
"mean_ttft": 1.837948133694739, |
|
"median_ttft": 0.8739883714588359, |
|
"std_ttft": 2.9986676734966005, |
|
"p99_ttft": 14.633168410130311, |
|
"mean_tpot": 0.018058968118651802, |
|
"mean_intvty": 55.3741494768559, |
|
"median_tpot": 0.018022274523237132, |
|
"median_intvty": 55.48689199638169, |
|
"std_tpot": 0.0002463139287413071, |
|
"std_intvty": 4059.859729046248, |
|
"p99_tpot": 0.0186554579039249, |
|
"p99_intvty": 53.60361590425562, |
|
"mean_itl": 0.35705229326821364, |
|
"median_itl": 0.3583208420313895, |
|
"std_itl": 0.09816686150576326, |
|
"p99_itl": 0.6723199915746227, |
|
"mean_e2el": 18.468073763782836, |
|
"median_e2el": 17.743848473532125, |
|
"std_e2el": 3.1805656680907304, |
|
"p99_e2el": 31.626253874243712 |
|
}, |
|
{ |
|
"hw": "mi355x", |
|
"conc": 8, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", |
|
"model": "amd/DeepSeek-R1-0528-MXFP4-Preview", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 4, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 203.81281760259895, |
|
"output_tput_per_gpu": 102.17929323263384, |
|
"input_tput_per_gpu": 101.63352436996512, |
|
"mean_ttft": 0.21936451276749722, |
|
"median_ttft": 0.12129646299581509, |
|
"std_ttft": 0.32991483535045485, |
|
"p99_ttft": 2.612228669367905, |
|
"mean_tpot": 0.019218743683179437, |
|
"mean_intvty": 52.03253742726256, |
|
"median_tpot": 0.01922359228998554, |
|
"median_intvty": 52.01941369308724, |
|
"std_tpot": 0.0004241450355786217, |
|
"std_intvty": 2357.684084727746, |
|
"p99_tpot": 0.020383353072807277, |
|
"p99_intvty": 49.0596417786662, |
|
"mean_itl": 0.019222518044605737, |
|
"median_itl": 0.018334227002924308, |
|
"std_itl": 0.012679762219095702, |
|
"p99_itl": 0.03890847842849324, |
|
"mean_e2el": 18.005479660739876, |
|
"median_e2el": 17.978295980996336, |
|
"std_e2el": 1.2369716478427177, |
|
"p99_e2el": 20.307550822279293 |
|
}, |
|
{ |
|
"hw": "b200", |
|
"conc": 4, |
|
"image": "lmsysorg/sglang:v0.5.5-cu129-amd64", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 8, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 120.24509077939304, |
|
"output_tput_per_gpu": 59.82998817274678, |
|
"input_tput_per_gpu": 60.41510260664626, |
|
"mean_ttft": 0.2297067509847693, |
|
"median_ttft": 0.15984388394281268, |
|
"std_ttft": 0.2251969081523954, |
|
"p99_ttft": 0.962432309994474, |
|
"mean_tpot": 0.007868964688318408, |
|
"mean_intvty": 127.08152083647222, |
|
"median_tpot": 0.00786640596167858, |
|
"median_intvty": 127.12285697833653, |
|
"std_tpot": 0.00015613741143868375, |
|
"std_intvty": 6404.614952853288, |
|
"p99_tpot": 0.008283681192286757, |
|
"p99_intvty": 120.71927646505, |
|
"mean_itl": 0.07833881631160375, |
|
"median_itl": 0.07552170264534652, |
|
"std_itl": 0.019457931635451328, |
|
"p99_itl": 0.1676410868158564, |
|
"mean_e2el": 7.444711529940832, |
|
"median_e2el": 7.45253109629266, |
|
"std_e2el": 0.6123301842447513, |
|
"p99_e2el": 9.126274410281331 |
|
}, |
|
{ |
|
"hw": "b200", |
|
"conc": 8, |
|
"image": "lmsysorg/sglang:v0.5.5-cu129-amd64", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 8, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 206.24243122047466, |
|
"output_tput_per_gpu": 103.50347354809152, |
|
"input_tput_per_gpu": 102.73895767238314, |
|
"mean_ttft": 0.27022171078715473, |
|
"median_ttft": 0.16239808546379209, |
|
"std_ttft": 0.30342116897091187, |
|
"p99_ttft": 1.1751088054012506, |
|
"mean_tpot": 0.00916415701088168, |
|
"mean_intvty": 109.1207842480855, |
|
"median_tpot": 0.00915676768178727, |
|
"median_intvty": 109.20884254702575, |
|
"std_tpot": 0.00029198749857808394, |
|
"std_intvty": 3424.804160691071, |
|
"p99_tpot": 0.00999270824329895, |
|
"p99_intvty": 100.07297077552465, |
|
"mean_itl": 0.09120682995429168, |
|
"median_itl": 0.08455537562258542, |
|
"std_itl": 0.03277008963791707, |
|
"p99_itl": 0.18098834120202809, |
|
"mean_e2el": 8.777538600657135, |
|
"median_e2el": 8.742960831848904, |
|
"std_e2el": 0.6976562064184395, |
|
"p99_e2el": 10.137662594267166 |
|
}, |
|
{ |
|
"hw": "b200", |
|
"conc": 32, |
|
"image": "lmsysorg/sglang:v0.5.5-cu129-amd64", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 439.1887015694889, |
|
"output_tput_per_gpu": 219.39400239211665, |
|
"input_tput_per_gpu": 219.79469917737225, |
|
"mean_ttft": 0.43220331492339026, |
|
"median_ttft": 0.38396384846419096, |
|
"std_ttft": 0.34183205058326116, |
|
"p99_ttft": 2.513194294311106, |
|
"mean_tpot": 0.01766089295007497, |
|
"mean_intvty": 56.62227854655305, |
|
"median_tpot": 0.017743388097921045, |
|
"median_intvty": 56.35902199068553, |
|
"std_tpot": 0.00048603618073616797, |
|
"std_intvty": 2057.459999141142, |
|
"p99_tpot": 0.018570088737930604, |
|
"p99_intvty": 53.85003885078026, |
|
"mean_itl": 0.5218189918077257, |
|
"median_itl": 0.5429791060741991, |
|
"std_itl": 0.09011961194626975, |
|
"p99_itl": 0.6960355830844491, |
|
"mean_e2el": 16.67577611648565, |
|
"median_e2el": 16.635727596236393, |
|
"std_e2el": 1.258674462950328, |
|
"p99_e2el": 19.447309473971835 |
|
}, |
|
{ |
|
"hw": "mi300x", |
|
"conc": 32, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 240.28322894738753, |
|
"output_tput_per_gpu": 120.33193516307752, |
|
"input_tput_per_gpu": 119.95129378431001, |
|
"mean_ttft": 0.43875684653175995, |
|
"median_ttft": 0.18919912353157997, |
|
"std_ttft": 0.7096973220616996, |
|
"p99_ttft": 2.6078403209522367, |
|
"mean_tpot": 0.031945752959704254, |
|
"mean_intvty": 31.303065583127132, |
|
"median_tpot": 0.03234773830294439, |
|
"median_intvty": 30.914062387755155, |
|
"std_tpot": 0.0014172512424314118, |
|
"std_intvty": 705.5911965788206, |
|
"p99_tpot": 0.03365913397600829, |
|
"p99_intvty": 29.709617624529034, |
|
"mean_itl": 0.031964843734840896, |
|
"median_itl": 0.028010625392198563, |
|
"std_itl": 0.022002877340968285, |
|
"p99_itl": 0.1586702957749367, |
|
"mean_e2el": 29.932418106409024, |
|
"median_e2el": 30.013407521881163, |
|
"std_e2el": 2.425361545144931, |
|
"p99_e2el": 34.3935982822068 |
|
}, |
|
{ |
|
"hw": "b200", |
|
"conc": 4, |
|
"image": "lmsysorg/sglang:v0.5.5-cu129-amd64", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 4, |
|
"ep": 4, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 221.12202772297672, |
|
"output_tput_per_gpu": 110.02302229262172, |
|
"input_tput_per_gpu": 111.099005430355, |
|
"mean_ttft": 0.20701409892644734, |
|
"median_ttft": 0.16666134190745652, |
|
"std_ttft": 0.1535687122626908, |
|
"p99_ttft": 0.7148557073948905, |
|
"mean_tpot": 0.008610545537568204, |
|
"mean_intvty": 116.13666005678203, |
|
"median_tpot": 0.008633882844184435, |
|
"median_intvty": 115.82274372341926, |
|
"std_tpot": 0.00016951327722173728, |
|
"std_intvty": 5899.242917072023, |
|
"p99_tpot": 0.008966876285553389, |
|
"p99_intvty": 111.52155646566783, |
|
"mean_itl": 0.08571590155892198, |
|
"median_itl": 0.08308944059535861, |
|
"std_itl": 0.01901525995653771, |
|
"p99_itl": 0.17541313607245684, |
|
"mean_e2el": 8.101448465150316, |
|
"median_e2el": 8.142378360033035, |
|
"std_e2el": 0.6163142913619017, |
|
"p99_e2el": 9.565222159053665 |
|
}, |
|
{ |
|
"hw": "b200-trt", |
|
"conc": 64, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 4, |
|
"ep": 4, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 1366.5673523618073, |
|
"output_tput_per_gpu": 683.129672362983, |
|
"input_tput_per_gpu": 683.4376799988243, |
|
"mean_ttft": 0.4568303398431908, |
|
"median_ttft": 0.11456687549070921, |
|
"std_ttft": 1.0374367950486671, |
|
"p99_ttft": 5.381849112740894, |
|
"mean_tpot": 0.02234443699234901, |
|
"mean_intvty": 44.75386872993987, |
|
"median_tpot": 0.022562204283499206, |
|
"median_intvty": 44.32191054715991, |
|
"std_tpot": 0.0011427359356261106, |
|
"std_intvty": 875.0928091292544, |
|
"p99_tpot": 0.02448816586672067, |
|
"p99_intvty": 40.83605139897376, |
|
"mean_itl": 0.22236324790509093, |
|
"median_itl": 0.18635848199483007, |
|
"std_itl": 0.07782023240492508, |
|
"p99_itl": 0.5049286349746398, |
|
"mean_e2el": 21.05218365431306, |
|
"median_e2el": 21.059494054497918, |
|
"std_e2el": 2.166727250854411, |
|
"p99_e2el": 27.261061567020082 |
|
}, |
|
{ |
|
"hw": "mi325x", |
|
"conc": 16, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 153.77998713545853, |
|
"output_tput_per_gpu": 76.48290034411293, |
|
"input_tput_per_gpu": 77.29708679134559, |
|
"mean_ttft": 0.2514936144165404, |
|
"median_ttft": 0.1376320410054177, |
|
"std_ttft": 0.3493596041451523, |
|
"p99_ttft": 1.3411809504963457, |
|
"mean_tpot": 0.025269813441890478, |
|
"mean_intvty": 39.57290790054952, |
|
"median_tpot": 0.025379357224639104, |
|
"median_intvty": 39.40210113080278, |
|
"std_tpot": 0.0004439990990953196, |
|
"std_intvty": 2252.25682222683, |
|
"p99_tpot": 0.025863465503007717, |
|
"p99_intvty": 38.66457880069119, |
|
"mean_itl": 0.02527630599983262, |
|
"median_itl": 0.023918416001833975, |
|
"std_itl": 0.00893579474326899, |
|
"p99_itl": 0.08887578076450145, |
|
"mean_e2el": 23.376627623062813, |
|
"median_e2el": 23.35095692804316, |
|
"std_e2el": 1.6821515116766381, |
|
"p99_e2el": 26.560598223117413 |
|
}, |
|
{ |
|
"hw": "mi300x", |
|
"conc": 8, |
|
"image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "sglang", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 1, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 83.21387824915277, |
|
"output_tput_per_gpu": 41.761171041413945, |
|
"input_tput_per_gpu": 41.452707207738825, |
|
"mean_ttft": 0.2894865931244567, |
|
"median_ttft": 0.17828640108928084, |
|
"std_ttft": 0.3526266750443899, |
|
"p99_ttft": 1.428675928255543, |
|
"mean_tpot": 0.02311063989057251, |
|
"mean_intvty": 43.270113018719506, |
|
"median_tpot": 0.023170519148420894, |
|
"median_intvty": 43.15829065349843, |
|
"std_tpot": 0.00038067810625572906, |
|
"std_intvty": 2626.89128575266, |
|
"p99_tpot": 0.023844030985488558, |
|
"p99_intvty": 41.939217433855816, |
|
"mean_itl": 0.02311783327041665, |
|
"median_itl": 0.02216945542022586, |
|
"std_itl": 0.011508656619891719, |
|
"p99_itl": 0.0254195882473141, |
|
"mean_e2el": 21.74456925011473, |
|
"median_e2el": 21.94070298410952, |
|
"std_e2el": 1.5284897199732408, |
|
"p99_e2el": 24.42748722226359 |
|
}, |
|
{ |
|
"hw": "b200-trt", |
|
"conc": 32, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "deepseek-ai/DeepSeek-R1-0528", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp8", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 8, |
|
"dp_attention": "false", |
|
"tput_per_gpu": 342.785513796456, |
|
"output_tput_per_gpu": 171.6642663813588, |
|
"input_tput_per_gpu": 171.12124741509717, |
|
"mean_ttft": 0.3772049359686207, |
|
"median_ttft": 0.1376246064901352, |
|
"std_ttft": 0.7399091392518318, |
|
"p99_ttft": 3.7817022288544107, |
|
"mean_tpot": 0.022346557272562843, |
|
"mean_intvty": 44.74962240504949, |
|
"median_tpot": 0.022482151086371707, |
|
"median_intvty": 44.47972954893016, |
|
"std_tpot": 0.0008080899146106663, |
|
"std_intvty": 1237.4860543604175, |
|
"p99_tpot": 0.023623484823969695, |
|
"p99_intvty": 42.330757187243805, |
|
"mean_itl": 0.2222407169017928, |
|
"median_itl": 0.19886512658558786, |
|
"std_itl": 0.06334746271004797, |
|
"p99_itl": 0.4483058636356144, |
|
"mean_e2el": 21.003921266173712, |
|
"median_e2el": 20.98152683977969, |
|
"std_e2el": 1.8073383647896561, |
|
"p99_e2el": 26.16535166225862 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 1075, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "true", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 16, |
|
"decode_ep": 16, |
|
"decode_dp_attention": "true", |
|
"decode_num_workers": 1, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 16, |
|
"tput_per_gpu": 4442.0033986024655, |
|
"output_tput_per_gpu": 2776.706016926881, |
|
"input_tput_per_gpu": 11103.192925304802, |
|
"mean_ttft": 1.9480372721163643, |
|
"median_ttft": 0.9353012915235013, |
|
"std_ttft": 3.0433942664282787, |
|
"p99_ttft": 14.64567912986735, |
|
"mean_tpot": 0.020550563984298366, |
|
"mean_intvty": 48.66046502490388, |
|
"median_tpot": 0.020383988802661422, |
|
"median_intvty": 49.058111720971695, |
|
"std_tpot": 0.0004260433798055789, |
|
"std_intvty": 2347.1788259128475, |
|
"p99_tpot": 0.021687672736124822, |
|
"p99_intvty": 46.10914283736472, |
|
"mean_itl": 0.4063176843756857, |
|
"median_itl": 0.4049514851067215, |
|
"std_itl": 0.09347745592232594, |
|
"p99_itl": 0.716668958151713, |
|
"mean_e2el": 20.872755769070423, |
|
"median_e2el": 20.183781350497156, |
|
"std_e2el": 3.237900787598791, |
|
"p99_e2el": 34.04821030642604 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 4, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "false", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 8, |
|
"decode_ep": 8, |
|
"decode_dp_attention": "false", |
|
"decode_num_workers": 4, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 32, |
|
"tput_per_gpu": 32.30548848847, |
|
"output_tput_per_gpu": 18.242645000595903, |
|
"input_tput_per_gpu": 144.80823639146277, |
|
"mean_ttft": 0.14840737418853678, |
|
"median_ttft": 0.14289077941793948, |
|
"std_ttft": 0.042909841580251844, |
|
"p99_ttft": 0.2658678555791266, |
|
"mean_tpot": 0.006387308303465267, |
|
"mean_intvty": 156.56047156162418, |
|
"median_tpot": 0.0062471594259788585, |
|
"median_intvty": 160.07275176002275, |
|
"std_tpot": 0.00034853667305162386, |
|
"std_intvty": 2869.138536396955, |
|
"p99_tpot": 0.007677267151823523, |
|
"p99_intvty": 130.25468310849095, |
|
"mean_itl": 0.12613988418477137, |
|
"median_itl": 0.12492778291925788, |
|
"std_itl": 0.032114731512498454, |
|
"p99_itl": 0.13669693334959454, |
|
"mean_e2el": 6.084865549652022, |
|
"median_e2el": 6.130423433962278, |
|
"std_e2el": 0.5584283270248697, |
|
"p99_e2el": 7.738454438014888 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 32, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "false", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 8, |
|
"decode_ep": 8, |
|
"decode_dp_attention": "false", |
|
"decode_num_workers": 4, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 32, |
|
"tput_per_gpu": 193.88061955696213, |
|
"output_tput_per_gpu": 108.76266393323671, |
|
"input_tput_per_gpu": 874.8242645467654, |
|
"mean_ttft": 0.2093679561976387, |
|
"median_ttft": 0.15004791400860995, |
|
"std_ttft": 0.17200220740846395, |
|
"p99_ttft": 0.9295085422927503, |
|
"mean_tpot": 0.00860492063866793, |
|
"mean_intvty": 116.2125767327011, |
|
"median_tpot": 0.008492272347939726, |
|
"median_intvty": 117.75411327247478, |
|
"std_tpot": 0.0004083274176912921, |
|
"std_intvty": 2449.0150714200395, |
|
"p99_tpot": 0.010123680577253494, |
|
"p99_intvty": 98.77830423125569, |
|
"mean_itl": 0.17013832593397857, |
|
"median_itl": 0.16807796503417194, |
|
"std_itl": 0.027877639896314154, |
|
"p99_itl": 0.2994745094375641, |
|
"mean_e2el": 8.093551296603437, |
|
"median_e2el": 8.029026238364168, |
|
"std_e2el": 0.6408468581357575, |
|
"p99_e2el": 9.83711937664775 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 256, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "false", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 8, |
|
"decode_ep": 8, |
|
"decode_dp_attention": "false", |
|
"decode_num_workers": 4, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 32, |
|
"tput_per_gpu": 889.9624711332857, |
|
"output_tput_per_gpu": 500.31443833525276, |
|
"input_tput_per_gpu": 4007.146733517549, |
|
"mean_ttft": 0.6670898931645297, |
|
"median_ttft": 0.2107954079983756, |
|
"std_ttft": 1.4167201466626331, |
|
"p99_ttft": 6.83380425700685, |
|
"mean_tpot": 0.014547515477535801, |
|
"mean_intvty": 68.7402602557251, |
|
"median_tpot": 0.01411182000639381, |
|
"median_intvty": 70.86258183189115, |
|
"std_tpot": 0.0010825023852577375, |
|
"std_intvty": 923.7854933334911, |
|
"p99_tpot": 0.016755068665706724, |
|
"p99_intvty": 59.68343191852985, |
|
"mean_itl": 0.2875959412064402, |
|
"median_itl": 0.2815297171473503, |
|
"std_itl": 0.03471473059498881, |
|
"p99_itl": 0.3370971063617617, |
|
"mean_e2el": 14.047743736099052, |
|
"median_e2el": 13.754055539960973, |
|
"std_e2el": 1.854379863531056, |
|
"p99_e2el": 20.471407575663175 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 16, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "false", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 8, |
|
"decode_ep": 8, |
|
"decode_dp_attention": "false", |
|
"decode_num_workers": 4, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 32, |
|
"tput_per_gpu": 113.05836519386133, |
|
"output_tput_per_gpu": 63.28888713157418, |
|
"input_tput_per_gpu": 511.2141896921585, |
|
"mean_ttft": 0.17807253625869635, |
|
"median_ttft": 0.13083326700143516, |
|
"std_ttft": 0.09339825318332076, |
|
"p99_ttft": 0.5639588387473486, |
|
"mean_tpot": 0.007518409364787082, |
|
"mean_intvty": 133.00685709979555, |
|
"median_tpot": 0.007513453579027073, |
|
"median_intvty": 133.09458686101198, |
|
"std_tpot": 9.004145068098673e-05, |
|
"std_intvty": 11105.996098874064, |
|
"p99_tpot": 0.0077366363256289854, |
|
"p99_intvty": 129.2551385267163, |
|
"mean_itl": 0.1486130380290456, |
|
"median_itl": 0.14964435994625092, |
|
"std_itl": 0.013255756954381144, |
|
"p99_itl": 0.16641649325378238, |
|
"mean_e2el": 7.073485194661771, |
|
"median_e2el": 7.088631398044527, |
|
"std_e2el": 0.4819814051232923, |
|
"p99_e2el": 7.937781208301894 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 64, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "false", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 8, |
|
"decode_ep": 8, |
|
"decode_dp_attention": "false", |
|
"decode_num_workers": 4, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 32, |
|
"tput_per_gpu": 331.7084493426723, |
|
"output_tput_per_gpu": 187.106606833859, |
|
"input_tput_per_gpu": 1488.5231894131787, |
|
"mean_ttft": 0.2947931587536914, |
|
"median_ttft": 0.18885245453566313, |
|
"std_ttft": 0.3531773135932712, |
|
"p99_ttft": 1.8415272141131571, |
|
"mean_tpot": 0.010029004650070852, |
|
"mean_intvty": 99.71079233600068, |
|
"median_tpot": 0.010045340462369374, |
|
"median_intvty": 99.54864185500509, |
|
"std_tpot": 0.00016509895971075173, |
|
"std_intvty": 6056.973355567891, |
|
"p99_tpot": 0.010307073854953048, |
|
"p99_intvty": 97.0207465351043, |
|
"mean_itl": 0.19837653581187137, |
|
"median_itl": 0.2000001723645255, |
|
"std_itl": 0.017391809715383795, |
|
"p99_itl": 0.21920554079581053, |
|
"mean_e2el": 9.563471757051047, |
|
"median_e2el": 9.541636548936367, |
|
"std_e2el": 0.6892220220873352, |
|
"p99_e2el": 11.61378111343598 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 564, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "false", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 8, |
|
"decode_ep": 8, |
|
"decode_dp_attention": "false", |
|
"decode_num_workers": 4, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 32, |
|
"tput_per_gpu": 1545.4493769534138, |
|
"output_tput_per_gpu": 869.40630670225, |
|
"input_tput_per_gpu": 6953.7939389627245, |
|
"mean_ttft": 2.441518805006113, |
|
"median_ttft": 1.6333737700479105, |
|
"std_ttft": 2.7245178959665166, |
|
"p99_ttft": 14.688298106354196, |
|
"mean_tpot": 0.016616069654606953, |
|
"mean_intvty": 60.18270389969996, |
|
"median_tpot": 0.016725962126458386, |
|
"median_intvty": 59.78729309796324, |
|
"std_tpot": 0.00045934587600503414, |
|
"std_intvty": 2177.008768854258, |
|
"p99_tpot": 0.016970812281506577, |
|
"p99_intvty": 58.92469867748872, |
|
"mean_itl": 0.3284953067926914, |
|
"median_itl": 0.3337935770396143, |
|
"std_itl": 0.03102841840298918, |
|
"p99_itl": 0.3532176049705595, |
|
"mean_e2el": 17.741085684470416, |
|
"median_e2el": 17.13221336551942, |
|
"std_e2el": 2.905664287107915, |
|
"p99_e2el": 29.981312049129052 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 128, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "false", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 8, |
|
"decode_ep": 8, |
|
"decode_dp_attention": "false", |
|
"decode_num_workers": 4, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 32, |
|
"tput_per_gpu": 548.3264733520737, |
|
"output_tput_per_gpu": 308.18287091292643, |
|
"input_tput_per_gpu": 2469.4752928652515, |
|
"mean_ttft": 0.41526711276424066, |
|
"median_ttft": 0.19203887740150094, |
|
"std_ttft": 0.7042873918169126, |
|
"p99_ttft": 3.5030307700508256, |
|
"mean_tpot": 0.011994281027247911, |
|
"mean_intvty": 83.37306735837338, |
|
"median_tpot": 0.011786082392241527, |
|
"median_intvty": 84.84583483467536, |
|
"std_tpot": 0.0005816401758990426, |
|
"std_intvty": 1719.2760084949387, |
|
"p99_tpot": 0.013430561973805008, |
|
"p99_intvty": 74.45704818237702, |
|
"mean_itl": 0.23720926229856373, |
|
"median_itl": 0.23530848696827888, |
|
"std_itl": 0.024985655915077898, |
|
"p99_itl": 0.2720074794720858, |
|
"mean_e2el": 11.452215554241093, |
|
"median_e2el": 11.329832831164822, |
|
"std_e2el": 1.1193682218335972, |
|
"p99_e2el": 14.931139022605493 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 8, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "false", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 8, |
|
"decode_ep": 8, |
|
"decode_dp_attention": "false", |
|
"decode_num_workers": 4, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 32, |
|
"tput_per_gpu": 61.299110216476706, |
|
"output_tput_per_gpu": 34.5057414434418, |
|
"input_tput_per_gpu": 275.64606040075597, |
|
"mean_ttft": 0.15919374672375852, |
|
"median_ttft": 0.12568423303309828, |
|
"std_ttft": 0.07043458638678603, |
|
"p99_ttft": 0.4057272569951601, |
|
"mean_tpot": 0.006728571503425011, |
|
"mean_intvty": 148.6199558837972, |
|
"median_tpot": 0.006689147116119343, |
|
"median_intvty": 149.49588978095946, |
|
"std_tpot": 0.000165634762200263, |
|
"std_intvty": 6037.379996301356, |
|
"p99_tpot": 0.0074147995356925196, |
|
"p99_intvty": 134.86541277162164, |
|
"mean_itl": 0.13295241337608124, |
|
"median_itl": 0.13338100002147257, |
|
"std_itl": 0.014608512970082246, |
|
"p99_itl": 0.153345231898129, |
|
"mean_e2el": 6.39341539315501, |
|
"median_e2el": 6.405678987503052, |
|
"std_e2el": 0.4058776802296038, |
|
"p99_e2el": 7.189627567902206 |
|
}, |
|
{ |
|
"hw": "b200-trt", |
|
"conc": 256, |
|
"image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", |
|
"model": "nvidia/DeepSeek-R1-0528-FP4-V2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "trt", |
|
"precision": "fp4", |
|
"spec_decoding": "none", |
|
"disagg": false, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": false, |
|
"tp": 8, |
|
"ep": 8, |
|
"dp_attention": "true", |
|
"tput_per_gpu": 2128.852677927518, |
|
"output_tput_per_gpu": 1064.940969960399, |
|
"input_tput_per_gpu": 1063.9117079671191, |
|
"mean_ttft": 1.310160240385369, |
|
"median_ttft": 1.168279828998493, |
|
"std_ttft": 0.8978656254236859, |
|
"p99_ttft": 3.9658524450630646, |
|
"mean_tpot": 0.027373944232363702, |
|
"mean_intvty": 36.53108925449328, |
|
"median_tpot": 0.027608027207607314, |
|
"median_intvty": 36.22134940972721, |
|
"std_tpot": 0.0008730665015690951, |
|
"std_intvty": 1145.3881212974923, |
|
"p99_tpot": 0.02854741118424829, |
|
"p99_intvty": 35.02944605189887, |
|
"mean_itl": 0.27226027570796396, |
|
"median_itl": 0.24703382499865256, |
|
"std_itl": 0.06409998257520498, |
|
"p99_itl": 0.5082758385234046, |
|
"mean_e2el": 26.533053960348944, |
|
"median_e2el": 26.538300427011563, |
|
"std_e2el": 2.13421962699385, |
|
"p99_e2el": 31.03455472946865 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 8, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "mtp", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "false", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 8, |
|
"decode_ep": 8, |
|
"decode_dp_attention": "false", |
|
"decode_num_workers": 4, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 32, |
|
"tput_per_gpu": 99.51521099812607, |
|
"output_tput_per_gpu": 56.01787902408906, |
|
"input_tput_per_gpu": 447.49386679042215, |
|
"mean_ttft": 0.18750758845635573, |
|
"median_ttft": 0.1494649270316586, |
|
"std_ttft": 0.06651220631348966, |
|
"p99_ttft": 0.40635371166281403, |
|
"mean_tpot": 0.004036221760066496, |
|
"mean_intvty": 247.75645627150206, |
|
"median_tpot": 0.003986540320365812, |
|
"median_intvty": 250.84407020577635, |
|
"std_tpot": 0.0004519423185463171, |
|
"std_intvty": 2212.6717480596267, |
|
"p99_tpot": 0.005237616471229547, |
|
"p99_intvty": 190.92654177583316, |
|
"mean_itl": 0.18209876070071734, |
|
"median_itl": 0.18269856786355376, |
|
"std_itl": 0.04708065180617375, |
|
"p99_itl": 0.23990691555198207, |
|
"mean_e2el": 3.9290679442383407, |
|
"median_e2el": 3.8794775566784665, |
|
"std_e2el": 0.4844947665131011, |
|
"p99_e2el": 5.294441736566366 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 16, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "mtp", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "false", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 8, |
|
"decode_ep": 8, |
|
"decode_dp_attention": "false", |
|
"decode_num_workers": 4, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 32, |
|
"tput_per_gpu": 175.52461944172794, |
|
"output_tput_per_gpu": 98.25684114228814, |
|
"input_tput_per_gpu": 793.6668458372463, |
|
"mean_ttft": 0.21040188713413954, |
|
"median_ttft": 0.16469682636670768, |
|
"std_ttft": 0.10812090479076027, |
|
"p99_ttft": 0.6610149128641933, |
|
"mean_tpot": 0.004692196283491124, |
|
"mean_intvty": 213.11981417281467, |
|
"median_tpot": 0.004715714771509992, |
|
"median_intvty": 212.05693059332248, |
|
"std_tpot": 0.00039921021092914204, |
|
"std_intvty": 2504.9459473307293, |
|
"p99_tpot": 0.005432069610460653, |
|
"p99_intvty": 184.09189714253267, |
|
"mean_itl": 0.21232542605911525, |
|
"median_itl": 0.21622406505048275, |
|
"std_itl": 0.02667942989675247, |
|
"p99_itl": 0.244650762360543, |
|
"mean_e2el": 4.511650454967821, |
|
"median_e2el": 4.497550335130654, |
|
"std_e2el": 0.4725469757470022, |
|
"p99_e2el": 5.441198795947712 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 64, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "mtp", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "false", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 8, |
|
"decode_ep": 8, |
|
"decode_dp_attention": "false", |
|
"decode_num_workers": 4, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 32, |
|
"tput_per_gpu": 479.8916806256891, |
|
"output_tput_per_gpu": 270.69224250272833, |
|
"input_tput_per_gpu": 2153.4871856093755, |
|
"mean_ttft": 0.35485877435303337, |
|
"median_ttft": 0.2365343039855361, |
|
"std_ttft": 0.39787715251864675, |
|
"p99_ttft": 2.092928964558523, |
|
"mean_tpot": 0.006664123922441312, |
|
"mean_intvty": 150.05723357462168, |
|
"median_tpot": 0.006663868170609483, |
|
"median_intvty": 150.06299260396972, |
|
"std_tpot": 0.0007488264325204436, |
|
"std_intvty": 1335.422945253337, |
|
"p99_tpot": 0.00865190497431278, |
|
"p99_intvty": 115.58148210931199, |
|
"mean_itl": 0.3062318582727932, |
|
"median_itl": 0.3048161950428039, |
|
"std_itl": 0.04874367714333303, |
|
"p99_itl": 0.42107235385105024, |
|
"mean_e2el": 6.508803171305317, |
|
"median_e2el": 6.419823716511019, |
|
"std_e2el": 0.8529058887369169, |
|
"p99_e2el": 8.816713589767458 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 144, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "mtp", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "false", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 8, |
|
"decode_ep": 8, |
|
"decode_dp_attention": "false", |
|
"decode_num_workers": 4, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 32, |
|
"tput_per_gpu": 840.975570182399, |
|
"output_tput_per_gpu": 472.11837840035133, |
|
"input_tput_per_gpu": 3791.83310443878, |
|
"mean_ttft": 1.2457016802365817, |
|
"median_ttft": 0.9568504230119288, |
|
"std_ttft": 1.1520934888264869, |
|
"p99_ttft": 6.275029098873493, |
|
"mean_tpot": 0.007679123476833952, |
|
"mean_intvty": 130.22319578748235, |
|
"median_tpot": 0.007759047048666262, |
|
"median_intvty": 128.88180645481387, |
|
"std_tpot": 0.0007452788364715315, |
|
"std_intvty": 1341.7796817288242, |
|
"p99_tpot": 0.008991732917540182, |
|
"p99_intvty": 111.21326769496224, |
|
"mean_itl": 0.35007406755007664, |
|
"median_itl": 0.3609183810185641, |
|
"std_itl": 0.04687425970042775, |
|
"p99_itl": 0.3739598415605724, |
|
"mean_e2el": 8.292461603375664, |
|
"median_e2el": 8.148701257421635, |
|
"std_e2el": 1.4007224146850852, |
|
"p99_e2el": 13.462928508874029 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 32, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "mtp", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "false", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 8, |
|
"decode_ep": 8, |
|
"decode_dp_attention": "false", |
|
"decode_num_workers": 4, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 32, |
|
"tput_per_gpu": 294.9592385759157, |
|
"output_tput_per_gpu": 165.465494243536, |
|
"input_tput_per_gpu": 1330.9091932349534, |
|
"mean_ttft": 0.2577960086227904, |
|
"median_ttft": 0.19730710645671934, |
|
"std_ttft": 0.20271399300979337, |
|
"p99_ttft": 1.1087125004036338, |
|
"mean_tpot": 0.005508829952219666, |
|
"mean_intvty": 181.52675044853603, |
|
"median_tpot": 0.005547889245630062, |
|
"median_intvty": 180.24873167532604, |
|
"std_tpot": 0.00046927384983360635, |
|
"std_intvty": 2130.951895901672, |
|
"p99_tpot": 0.00633126788299178, |
|
"p99_intvty": 157.94624686255725, |
|
"mean_itl": 0.25001352380896114, |
|
"median_itl": 0.2541605730075389, |
|
"std_itl": 0.03360556282227612, |
|
"p99_itl": 0.29149442691821603, |
|
"mean_e2el": 5.303967301434568, |
|
"median_e2el": 5.298168066539802, |
|
"std_e2el": 0.5386612306643197, |
|
"p99_e2el": 6.40852697287919 |
|
}, |
|
{ |
|
"hw": "gb200", |
|
"conc": 4, |
|
"image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", |
|
"model": "/mnt/lustre01/models/deepseek-r1-0528-fp4-v2", |
|
"infmax_model_prefix": "dsr1", |
|
"framework": "dynamo-trt", |
|
"precision": "fp4", |
|
"spec_decoding": "mtp", |
|
"disagg": true, |
|
"isl": 1024, |
|
"osl": 1024, |
|
"is_multinode": true, |
|
"prefill_tp": 4, |
|
"prefill_ep": 4, |
|
"prefill_dp_attention": "false", |
|
"prefill_num_workers": 1, |
|
"decode_tp": 8, |
|
"decode_ep": 8, |
|
"decode_dp_attention": "false", |
|
"decode_num_workers": 4, |
|
"num_prefill_gpu": 4, |
|
"num_decode_gpu": 32, |
|
"tput_per_gpu": 49.02296004009678, |
|
"output_tput_per_gpu": 27.682864390336267, |
|
"input_tput_per_gpu": 219.74372523818087, |
|
"mean_ttft": 0.6219795793585945, |
|
"median_ttft": 0.1346952844178304, |
|
"std_ttft": 1.374741126592172, |
|
"p99_ttft": 4.935023965949658, |
|
"mean_tpot": 0.003636352923824148, |
|
"mean_intvty": 275.00081013818544, |
|
"median_tpot": 0.0036389672215150965, |
|
"median_intvty": 274.80324474691105, |
|
"std_tpot": 0.00020975560073827006, |
|
"std_intvty": 4767.453152527666, |
|
"p99_tpot": 0.004138613512598813, |
|
"p99_intvty": 241.6268146218024, |
|
"mean_itl": 0.1632820755978074, |
|
"median_itl": 0.16497785109095275, |
|
"std_itl": 0.03199877654797429, |
|
"p99_itl": 0.18763630567118517, |
|
"mean_e2el": 3.9947748274280457, |
|
"median_e2el": 3.519940600497648, |
|
"std_e2el": 1.409657149455134, |
|
"p99_e2el": 8.321354570840485 |
|
} |
|
] |