A100
llava
16
{"model_name": "llava-hf", "engine_args": {}, "benchmark_args": {"verbose": false, "backend": "VLLMChat", "results_filename": "metrics.jsonl", "port": 8000, "random_prompt_lens_mean": null, "random_prompt_lens_range": null, "distribution": "uniform", "qps": 16.0, "concurrency": 10000, "model": "llava-hf/llava-1.5-7b-hf", "warmup": false, "skip_wait_for_ready": true, "repeat": 1, "log_latencies": false, "fail_on_response_failure": false, "variable_response_lens_mean": null, "variable_response_lens_range": null, "variable_response_lens_distribution": "uniform", "num_requests": 1000, "prompts_filename": "./filtered-prompts.json", "gen_random_prompts": false, "allow_variable_generation_length": false, "fixed_max_tokens": 128, "print_generation_lens_and_exit": false, "name": ""}, "backend": "VLLMChat", "input_len": null, "output_len": null, "tp": "NA", "dur_s": 107.19906306266785, "tokens_per_s": 6543.2568154998535, "qps": 9.32843974033063, "successful_responses": 1000, "prompt_token_count": 597170, "response_token_count": 104261, "perf_metrics": [
{"perf_metric_name": "tokens_per_s", "perf_metric_value": 6543.2568154998535, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "qps", "perf_metric_value": 9.32843974033063, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "p50_token_latency", "perf_metric_value": 0.2734226007014513, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_token_latency", "perf_metric_value": 0.37526187747716905, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_token_latency", "perf_metric_value": 0.40012550486251713, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p50_e2e_latency", "perf_metric_value": 34.99809288978577, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_e2e_latency", "perf_metric_value": 48.03352031707764, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_e2e_latency", "perf_metric_value": 51.21606462240219, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_token_latency", "perf_metric_value": 0.08899107278955123, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_e2e_latency", "perf_metric_value": 11.390857317062558, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_token_latency", "perf_metric_value": 0.40459249541163445, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_e2e_latency", "perf_metric_value": 51.78783941268921, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_token_latency", "perf_metric_value": 0.05021548829972744, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_e2e_latency", "perf_metric_value": 6.427582502365112, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "time_to_first_token", "perf_metric_value": 0.11026378534734249, "perf_metric_type": "LATENCY"}]}
12
{"model_name": "llava-hf", "engine_args": {}, "benchmark_args": {"verbose": false, "backend": "VLLMChat", "results_filename": "metrics.jsonl", "port": 8000, "random_prompt_lens_mean": null, "random_prompt_lens_range": null, "distribution": "uniform", "qps": 12.0, "concurrency": 10000, "model": "llava-hf/llava-1.5-7b-hf", "warmup": false, "skip_wait_for_ready": true, "repeat": 1, "log_latencies": false, "fail_on_response_failure": false, "variable_response_lens_mean": null, "variable_response_lens_range": null, "variable_response_lens_distribution": "uniform", "num_requests": 1000, "prompts_filename": "./filtered-prompts.json", "gen_random_prompts": false, "allow_variable_generation_length": false, "fixed_max_tokens": 128, "print_generation_lens_and_exit": false, "name": ""}, "backend": "VLLMChat", "input_len": null, "output_len": null, "tp": "NA", "dur_s": 107.6016116142273, "tokens_per_s": 6518.582663191805, "qps": 9.293541100343315, "successful_responses": 1000, "prompt_token_count": 597170, "response_token_count": 104240, "perf_metrics": [
{"perf_metric_name": "tokens_per_s", "perf_metric_value": 6518.582663191805, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "qps", "perf_metric_value": 9.293541100343315, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "p50_token_latency", "perf_metric_value": 0.19234111160039902, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_token_latency", "perf_metric_value": 0.25062212646007537, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_token_latency", "perf_metric_value": 0.2752254455350339, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p50_e2e_latency", "perf_metric_value": 24.619662284851074, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_e2e_latency", "perf_metric_value": 32.07963218688965, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_e2e_latency", "perf_metric_value": 35.22885702848434, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_token_latency", "perf_metric_value": 0.04885991046343519, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_e2e_latency", "perf_metric_value": 6.254068539319705, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_token_latency", "perf_metric_value": 0.2798909731209278, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_e2e_latency", "perf_metric_value": 35.82604455947876, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_token_latency", "perf_metric_value": 0.03366171754896641, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_e2e_latency", "perf_metric_value": 4.3086998462677, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "time_to_first_token", "perf_metric_value": 0.03366171754896641, "perf_metric_type": "LATENCY"}]}
8
{"model_name": "llava-hf", "engine_args": {}, "benchmark_args": {"verbose": false, "backend": "VLLMChat", "results_filename": "metrics.jsonl", "port": 8000, "random_prompt_lens_mean": null, "random_prompt_lens_range": null, "distribution": "uniform", "qps": 8.0, "concurrency": 10000, "model": "llava-hf/llava-1.5-7b-hf", "warmup": false, "skip_wait_for_ready": true, "repeat": 1, "log_latencies": false, "fail_on_response_failure": false, "variable_response_lens_mean": null, "variable_response_lens_range": null, "variable_response_lens_distribution": "uniform", "num_requests": 1000, "prompts_filename": "./filtered-prompts.json", "gen_random_prompts": false, "allow_variable_generation_length": false, "fixed_max_tokens": 128, "print_generation_lens_and_exit": false, "name": ""}, "backend": "VLLMChat", "input_len": null, "output_len": null, "tp": "NA", "dur_s": 127.87555360794067, "tokens_per_s": 5483.057396175064, "qps": 7.820102996902319, "successful_responses": 1000, "prompt_token_count": 597170, "response_token_count": 103979, "perf_metrics": [
{"perf_metric_name": "tokens_per_s", "perf_metric_value": 5483.057396175064, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "qps", "perf_metric_value": 7.820102996902319, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "p50_token_latency", "perf_metric_value": 0.041319290176033974, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_token_latency", "perf_metric_value": 0.0503193449229002, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_token_latency", "perf_metric_value": 0.05487547442317009, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p50_e2e_latency", "perf_metric_value": 5.288869142532349, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_e2e_latency", "perf_metric_value": 6.440876150131226, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_e2e_latency", "perf_metric_value": 7.0240607261657715, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_token_latency", "perf_metric_value": 0.00901270361699523, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_e2e_latency", "perf_metric_value": 1.1536260629753894, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_token_latency", "perf_metric_value": 0.05608443357050419, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_e2e_latency", "perf_metric_value": 7.178807497024536, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_token_latency", "perf_metric_value": 0.013619335368275642, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_e2e_latency", "perf_metric_value": 1.7432749271392822, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "time_to_first_token", "perf_metric_value": 0.014415653422474861, "perf_metric_type": "LATENCY"}]}
4
{"model_name": "llava-hf", "engine_args": {}, "benchmark_args": {"verbose": false, "backend": "VLLMChat", "results_filename": "metrics.jsonl", "port": 8000, "random_prompt_lens_mean": null, "random_prompt_lens_range": null, "distribution": "uniform", "qps": 4.0, "concurrency": 10000, "model": "llava-hf/llava-1.5-7b-hf", "warmup": false, "skip_wait_for_ready": true, "repeat": 1, "log_latencies": false, "fail_on_response_failure": false, "variable_response_lens_mean": null, "variable_response_lens_range": null, "variable_response_lens_distribution": "uniform", "num_requests": 1000, "prompts_filename": "./filtered-prompts.json", "gen_random_prompts": false, "allow_variable_generation_length": false, "fixed_max_tokens": 128, "print_generation_lens_and_exit": false, "name": ""}, "backend": "VLLMChat", "input_len": null, "output_len": null, "tp": "NA", "dur_s": 252.1038429737091, "tokens_per_s": 2781.052409713232, "qps": 3.9666194223952624, "successful_responses": 1000, "prompt_token_count": 597170, "response_token_count": 103944, "perf_metrics": [
{"perf_metric_name": "tokens_per_s", "perf_metric_value": 2781.052409713232, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "qps", "perf_metric_value": 3.9666194223952624, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "p50_token_latency", "perf_metric_value": 0.015399465337395668, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_token_latency", "perf_metric_value": 0.01858694963157177, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_token_latency", "perf_metric_value": 0.018932330254465342, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p50_e2e_latency", "perf_metric_value": 1.9711315631866455, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_e2e_latency", "perf_metric_value": 2.3791295528411864, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_e2e_latency", "perf_metric_value": 2.4233382725715638, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_token_latency", "perf_metric_value": 0.003053107942153016, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_e2e_latency", "perf_metric_value": 0.39079781659558605, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_token_latency", "perf_metric_value": 0.02067841775715351, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_e2e_latency", "perf_metric_value": 2.6468374729156494, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_token_latency", "perf_metric_value": 0.008439883589744568, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_e2e_latency", "perf_metric_value": 1.0803050994873047, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "time_to_first_token", "perf_metric_value": 0.009089522063732147, "perf_metric_type": "LATENCY"}]}
6
{"model_name": "llava-hf", "engine_args": {}, "benchmark_args": {"verbose": false, "backend": "VLLMChat", "results_filename": "metrics.jsonl", "port": 8000, "random_prompt_lens_mean": null, "random_prompt_lens_range": null, "distribution": "uniform", "qps": 6.0, "concurrency": 10000, "model": "llava-hf/llava-1.5-7b-hf", "warmup": false, "skip_wait_for_ready": true, "repeat": 1, "log_latencies": false, "fail_on_response_failure": false, "variable_response_lens_mean": null, "variable_response_lens_range": null, "variable_response_lens_distribution": "uniform", "num_requests": 1000, "prompts_filename": "./filtered-prompts.json", "gen_random_prompts": false, "allow_variable_generation_length": false, "fixed_max_tokens": 128, "print_generation_lens_and_exit": false, "name": ""}, "backend": "VLLMChat", "input_len": null, "output_len": null, "tp": "NA", "dur_s": 168.98763060569763, "tokens_per_s": 4149.019650059289, "qps": 5.91759288189158, "successful_responses": 1000, "prompt_token_count": 597170, "response_token_count": 103963, "perf_metrics": [
{"perf_metric_name": "tokens_per_s", "perf_metric_value": 4149.019650059289, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "qps", "perf_metric_value": 5.91759288189158, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "p50_token_latency", "perf_metric_value": 0.022137231193482876, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_token_latency", "perf_metric_value": 0.026809886284172534, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_token_latency", "perf_metric_value": 0.028676126226782797, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p50_e2e_latency", "perf_metric_value": 2.833565592765808, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_e2e_latency", "perf_metric_value": 3.4316654443740844, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_e2e_latency", "perf_metric_value": 3.670544157028198, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_token_latency", "perf_metric_value": 0.0044735476994932225, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_e2e_latency", "perf_metric_value": 0.5726141055351325, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_token_latency", "perf_metric_value": 0.030819762498140335, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_e2e_latency", "perf_metric_value": 3.944929599761963, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_token_latency", "perf_metric_value": 0.011092148721218109, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_e2e_latency", "perf_metric_value": 1.419795036315918, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "time_to_first_token", "perf_metric_value": 0.011092148721218109, "perf_metric_type": "LATENCY"}]}
10
{"model_name": "llava-hf", "engine_args": {}, "benchmark_args": {"verbose": false, "backend": "VLLMChat", "results_filename": "metrics.jsonl", "port": 8000, "random_prompt_lens_mean": null, "random_prompt_lens_range": null, "distribution": "uniform", "qps": 10.0, "concurrency": 10000, "model": "llava-hf/llava-1.5-7b-hf", "warmup": false, "skip_wait_for_ready": true, "repeat": 1, "log_latencies": false, "fail_on_response_failure": false, "variable_response_lens_mean": null, "variable_response_lens_range": null, "variable_response_lens_distribution": "uniform", "num_requests": 1000, "prompts_filename": "./filtered-prompts.json", "gen_random_prompts": false, "allow_variable_generation_length": false, "fixed_max_tokens": 128, "print_generation_lens_and_exit": false, "name": ""}, "backend": "VLLMChat", "input_len": null, "output_len": null, "tp": "NA", "dur_s": 108.17407441139221, "tokens_per_s": 6486.027301991962, "qps": 9.24435920012537, "successful_responses": 1000, "prompt_token_count": 597170, "response_token_count": 104450, "perf_metrics": [
{"perf_metric_name": "tokens_per_s", "perf_metric_value": 6486.027301991962, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "qps", "perf_metric_value": 9.24435920012537, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "p50_token_latency", "perf_metric_value": 0.12879762705415487, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_token_latency", "perf_metric_value": 0.16809469517320394, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_token_latency", "perf_metric_value": 0.17838837679475547, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p50_e2e_latency", "perf_metric_value": 16.486096262931824, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_e2e_latency", "perf_metric_value": 21.516120982170104, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_e2e_latency", "perf_metric_value": 22.8337122297287, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_token_latency", "perf_metric_value": 0.034685117463125, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_e2e_latency", "perf_metric_value": 4.43969503528, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_token_latency", "perf_metric_value": 0.18270833045244217, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_e2e_latency", "perf_metric_value": 23.386666297912598, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_token_latency", "perf_metric_value": 0.01959644816815853, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_e2e_latency", "perf_metric_value": 2.508345365524292, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "time_to_first_token", "perf_metric_value": 0.01959644816815853, "perf_metric_type": "LATENCY"}]}
llama
16
{"model_name": "meta-llama", "engine_args": {}, "benchmark_args": {"verbose": false, "backend": "VLLMChat", "results_filename": "metrics.jsonl", "port": 8000, "random_prompt_lens_mean": null, "random_prompt_lens_range": null, "distribution": "uniform", "qps": 16.0, "concurrency": 10000, "model": "meta-llama/Llama-2-7b-chat-hf", "warmup": false, "skip_wait_for_ready": true, "repeat": 1, "log_latencies": false, "fail_on_response_failure": false, "variable_response_lens_mean": null, "variable_response_lens_range": null, "variable_response_lens_distribution": "uniform", "num_requests": 1000, "prompts_filename": "./filtered-prompts_text.json", "gen_random_prompts": false, "allow_variable_generation_length": false, "fixed_max_tokens": 128, "print_generation_lens_and_exit": false, "name": ""}, "backend": "VLLMChat", "input_len": null, "output_len": null, "tp": "NA", "dur_s": 107.38329553604126, "tokens_per_s": 6403.81724706147, "qps": 9.312435374684213, "successful_responses": 1000, "prompt_token_count": 559663, "response_token_count": 128000, "perf_metrics": [
{"perf_metric_name": "tokens_per_s", "perf_metric_value": 6403.81724706147, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "qps", "perf_metric_value": 9.312435374684213, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "p50_token_latency", "perf_metric_value": 0.2722818097099662, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_token_latency", "perf_metric_value": 0.3738405771553516, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_token_latency", "perf_metric_value": 0.38317211927846073, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p50_e2e_latency", "perf_metric_value": 34.85207164287567, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_e2e_latency", "perf_metric_value": 47.85159387588501, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_e2e_latency", "perf_metric_value": 49.046031267642974, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_token_latency", "perf_metric_value": 0.07768880395801925, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_e2e_latency", "perf_metric_value": 9.944166906626464, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_token_latency", "perf_metric_value": 0.3861427418887615, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_e2e_latency", "perf_metric_value": 49.426270961761475, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_token_latency", "perf_metric_value": 0.15147855691611767, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_e2e_latency", "perf_metric_value": 19.38925528526306, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "time_to_first_token", "perf_metric_value": 0.1551942154765129, "perf_metric_type": "LATENCY"}]}
12
{"model_name": "meta-llama", "engine_args": {}, "benchmark_args": {"verbose": false, "backend": "VLLMChat", "results_filename": "metrics.jsonl", "port": 8000, "random_prompt_lens_mean": null, "random_prompt_lens_range": null, "distribution": "uniform", "qps": 12.0, "concurrency": 10000, "model": "meta-llama/Llama-2-7b-chat-hf", "warmup": false, "skip_wait_for_ready": true, "repeat": 1, "log_latencies": false, "fail_on_response_failure": false, "variable_response_lens_mean": null, "variable_response_lens_range": null, "variable_response_lens_distribution": "uniform", "num_requests": 1000, "prompts_filename": "./filtered-prompts_text.json", "gen_random_prompts": false, "allow_variable_generation_length": false, "fixed_max_tokens": 128, "print_generation_lens_and_exit": false, "name": ""}, "backend": "VLLMChat", "input_len": null, "output_len": null, "tp": "NA", "dur_s": 108.3005964756012, "tokens_per_s": 6349.577217286353, "qps": 9.233559486676398, "successful_responses": 1000, "prompt_token_count": 559663, "response_token_count": 128000, "perf_metrics": [
{"perf_metric_name": "tokens_per_s", "perf_metric_value": 6349.577217286353, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "qps", "perf_metric_value": 9.233559486676398, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "p50_token_latency", "perf_metric_value": 0.19406349584460258, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_token_latency", "perf_metric_value": 0.23629574347287416, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_token_latency", "perf_metric_value": 0.2505883136764169, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p50_e2e_latency", "perf_metric_value": 24.84012746810913, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_e2e_latency", "perf_metric_value": 30.245855164527892, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_e2e_latency", "perf_metric_value": 32.07530415058136, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_token_latency", "perf_metric_value": 0.033255816948290454, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_e2e_latency", "perf_metric_value": 4.256744569381178, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_token_latency", "perf_metric_value": 0.2517655659466982, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_e2e_latency", "perf_metric_value": 32.22599244117737, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_token_latency", "perf_metric_value": 0.10007438994944096, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_e2e_latency", "perf_metric_value": 12.809521913528442, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "time_to_first_token", "perf_metric_value": 0.10007438994944096, "perf_metric_type": "LATENCY"}]}
8
{"model_name": "meta-llama", "engine_args": {}, "benchmark_args": {"verbose": false, "backend": "VLLMChat", "results_filename": "metrics.jsonl", "port": 8000, "random_prompt_lens_mean": null, "random_prompt_lens_range": null, "distribution": "uniform", "qps": 8.0, "concurrency": 10000, "model": "meta-llama/Llama-2-7b-chat-hf", "warmup": false, "skip_wait_for_ready": true, "repeat": 1, "log_latencies": false, "fail_on_response_failure": false, "variable_response_lens_mean": null, "variable_response_lens_range": null, "variable_response_lens_distribution": "uniform", "num_requests": 1000, "prompts_filename": "./filtered-prompts_text.json", "gen_random_prompts": false, "allow_variable_generation_length": false, "fixed_max_tokens": 128, "print_generation_lens_and_exit": false, "name": ""}, "backend": "VLLMChat", "input_len": null, "output_len": null, "tp": "NA", "dur_s": 127.94017839431763, "tokens_per_s": 5374.879171112224, "qps": 7.816152928268969, "successful_responses": 1000, "prompt_token_count": 559663, "response_token_count": 128000, "perf_metrics": [
{"perf_metric_name": "tokens_per_s", "perf_metric_value": 5374.879171112224, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "qps", "perf_metric_value": 7.816152928268969, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "p50_token_latency", "perf_metric_value": 0.03611143212765455, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_token_latency", "perf_metric_value": 0.03912978358566761, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_token_latency", "perf_metric_value": 0.043118344340473416, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p50_e2e_latency", "perf_metric_value": 4.622263312339783, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_e2e_latency", "perf_metric_value": 5.0086122989654545, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_e2e_latency", "perf_metric_value": 5.519148075580597, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_token_latency", "perf_metric_value": 0.0027364624303395386, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_e2e_latency", "perf_metric_value": 0.35026719108346094, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_token_latency", "perf_metric_value": 0.04404551908373833, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_e2e_latency", "perf_metric_value": 5.637826442718506, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_token_latency", "perf_metric_value": 0.017856869846582413, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_e2e_latency", "perf_metric_value": 2.285679340362549, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "time_to_first_token", "perf_metric_value": 0.027424929663538933, "perf_metric_type": "LATENCY"}]}
4
{"model_name": "meta-llama", "engine_args": {}, "benchmark_args": {"verbose": false, "backend": "VLLMChat", "results_filename": "metrics.jsonl", "port": 8000, "random_prompt_lens_mean": null, "random_prompt_lens_range": null, "distribution": "uniform", "qps": 4.0, "concurrency": 10000, "model": "meta-llama/Llama-2-7b-chat-hf", "warmup": false, "skip_wait_for_ready": true, "repeat": 1, "log_latencies": false, "fail_on_response_failure": false, "variable_response_lens_mean": null, "variable_response_lens_range": null, "variable_response_lens_distribution": "uniform", "num_requests": 1000, "prompts_filename": "./filtered-prompts_text.json", "gen_random_prompts": false, "allow_variable_generation_length": false, "fixed_max_tokens": 128, "print_generation_lens_and_exit": false, "name": ""}, "backend": "VLLMChat", "input_len": null, "output_len": null, "tp": "NA", "dur_s": 252.27554035186768, "tokens_per_s": 2725.8409556505744, "qps": 3.9639197625153226, "successful_responses": 1000, "prompt_token_count": 559663, "response_token_count": 128000, "perf_metrics": [
{"perf_metric_name": "tokens_per_s", "perf_metric_value": 2725.8409556505744, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "qps", "perf_metric_value": 3.9639197625153226, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "p50_token_latency", "perf_metric_value": 0.01684090681374073, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_token_latency", "perf_metric_value": 0.017051630094647406, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_token_latency", "perf_metric_value": 0.0172210156545043, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p50_e2e_latency", "perf_metric_value": 2.1556360721588135, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_e2e_latency", "perf_metric_value": 2.182608652114868, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_e2e_latency", "perf_metric_value": 2.2042900037765505, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_token_latency", "perf_metric_value": 0.0002677782321113971, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_e2e_latency", "perf_metric_value": 0.03427561371025883, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_token_latency", "perf_metric_value": 0.017321225255727768, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_e2e_latency", "perf_metric_value": 2.2171168327331543, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_token_latency", "perf_metric_value": 0.01299804262816906, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_e2e_latency", "perf_metric_value": 1.6637494564056396, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "time_to_first_token", "perf_metric_value": 0.015207679942250252, "perf_metric_type": "LATENCY"}]}
10
{"model_name": "meta-llama", "engine_args": {}, "benchmark_args": {"verbose": false, "backend": "VLLMChat", "results_filename": "metrics.jsonl", "port": 8000, "random_prompt_lens_mean": null, "random_prompt_lens_range": null, "distribution": "uniform", "qps": 10.0, "concurrency": 10000, "model": "meta-llama/Llama-2-7b-chat-hf", "warmup": false, "skip_wait_for_ready": true, "repeat": 1, "log_latencies": false, "fail_on_response_failure": false, "variable_response_lens_mean": null, "variable_response_lens_range": null, "variable_response_lens_distribution": "uniform", "num_requests": 1000, "prompts_filename": "./filtered-prompts_text.json", "gen_random_prompts": false, "allow_variable_generation_length": false, "fixed_max_tokens": 128, "print_generation_lens_and_exit": false, "name": ""}, "backend": "VLLMChat", "input_len": null, "output_len": null, "tp": "NA", "dur_s": 109.31059122085571, "tokens_per_s": 6290.909163693175, "qps": 9.148244363435543, "successful_responses": 1000, "prompt_token_count": 559663, "response_token_count": 128000, "perf_metrics": [
{"perf_metric_name": "tokens_per_s", "perf_metric_value": 6290.909163693175, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "qps", "perf_metric_value": 9.148244363435543, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "p50_token_latency", "perf_metric_value": 0.14209844917058945, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_token_latency", "perf_metric_value": 0.14465151838958262, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_token_latency", "perf_metric_value": 0.14629587290808557, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p50_e2e_latency", "perf_metric_value": 18.18860149383545, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_e2e_latency", "perf_metric_value": 18.515394353866576, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_e2e_latency", "perf_metric_value": 18.725871732234953, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_token_latency", "perf_metric_value": 0.021605273683310278, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_e2e_latency", "perf_metric_value": 2.7654750314637155, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_token_latency", "perf_metric_value": 0.1469615399837494, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_e2e_latency", "perf_metric_value": 18.811077117919922, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_token_latency", "perf_metric_value": 0.046129707247018814, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_e2e_latency", "perf_metric_value": 5.904602527618408, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "time_to_first_token", "perf_metric_value": 0.046129707247018814, "perf_metric_type": "LATENCY"}]}
6
{"model_name": "meta-llama", "engine_args": {}, "benchmark_args": {"verbose": false, "backend": "VLLMChat", "results_filename": "metrics.jsonl", "port": 8000, "random_prompt_lens_mean": null, "random_prompt_lens_range": null, "distribution": "uniform", "qps": 6.0, "concurrency": 10000, "model": "meta-llama/Llama-2-7b-chat-hf", "warmup": false, "skip_wait_for_ready": true, "repeat": 1, "log_latencies": false, "fail_on_response_failure": false, "variable_response_lens_mean": null, "variable_response_lens_range": null, "variable_response_lens_distribution": "uniform", "num_requests": 1000, "prompts_filename": "./filtered-prompts_text.json", "gen_random_prompts": false, "allow_variable_generation_length": false, "fixed_max_tokens": 128, "print_generation_lens_and_exit": false, "name": ""}, "backend": "VLLMChat", "input_len": null, "output_len": null, "tp": "NA", "dur_s": 169.16680216789246, "tokens_per_s": 4064.999699630884, "qps": 5.911325314334032, "successful_responses": 1000, "prompt_token_count": 559663, "response_token_count": 128000, "perf_metrics": [
{"perf_metric_name": "tokens_per_s", "perf_metric_value": 4064.999699630884, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "qps", "perf_metric_value": 5.911325314334032, "perf_metric_type": "THROUGHPUT"},
{"perf_metric_name": "p50_token_latency", "perf_metric_value": 0.02305004559457302, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_token_latency", "perf_metric_value": 0.02351635806262493, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_token_latency", "perf_metric_value": 0.025685259103775025, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p50_e2e_latency", "perf_metric_value": 2.9504058361053467, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p90_e2e_latency", "perf_metric_value": 3.010093832015991, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "p99_e2e_latency", "perf_metric_value": 3.2877131652832032, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_token_latency", "perf_metric_value": 0.000961756010803708, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "std_e2e_latency", "perf_metric_value": 0.12310476938287462, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_token_latency", "perf_metric_value": 0.025977708399295807, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "max_e2e_latency", "perf_metric_value": 3.3251466751098633, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_token_latency", "perf_metric_value": 0.014674795791506767, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "min_e2e_latency", "perf_metric_value": 1.8783738613128662, "perf_metric_type": "LATENCY"},
{"perf_metric_name": "time_to_first_token", "perf_metric_value": 0.018100978806614876, "perf_metric_type": "LATENCY"}]}
A10
llava 4
{"model_name": "llava-hf", "engine_args": {}, "benchmark_args": {"verbose": false, "backend": "VLLMChat", "results_filename": "metrics.jsonl", "port": 8000, "random_prompt_lens_mean": null, "random_prompt_lens_range": null, "distribution": "uniform", "qps": 4.0, "concurrency": 10000, "model": "llava-hf/llava-1.5-7b-hf", "warmup": false, "skip_wait_for_ready": true, "repeat": 1, "log_latencies": false, "fail_on_response_failure": false, "variable_response_lens_mean": null, "variable_response_lens_range": null, "variable_response_lens_distribution": "uniform", "num_requests": 1000, "prompts_filename": "filtered-prompts.json", "gen_random_prompts": false, "images_path": "/mnt/local_storage/pixel_values", "allow_variable_generation_length": false, "fixed_max_tokens": 128, "print_generation_lens_and_exit": false, "name": ""}, "backend": "VLLMChat", "input_len": null, "output_len": null, "tp": "NA", "dur_s": 485.38395524024963, "tokens_per_s": 1443.6983184851094, "qps": 2.0602246720434585, "successful_responses": 1000, "prompt_token_count": 597200, "response_token_count": 103548, "perf_metrics": [{"perf_metric_name": "tokens_per_s", "perf_metric_value": 1443.6983184851094, "perf_metric_type": "THROUGHPUT"}, {"perf_metric_name": "qps", "perf_metric_value": 2.0602246720434585, "perf_metric_type": "THROUGHPUT"}, {"perf_metric_name": "p50_token_latency", "perf_metric_value": 0.9684563595801592, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p90_token_latency", "perf_metric_value": 1.6892448334023358, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p99_token_latency", "perf_metric_value": 1.8362966247089207, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p50_e2e_latency", "perf_metric_value": 123.96241402626038, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p90_e2e_latency", "perf_metric_value": 216.22333867549898, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p99_e2e_latency", "perf_metric_value": 235.04596796274186, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "std_token_latency", "perf_metric_value": 0.5262933197032936, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "std_e2e_latency", "perf_metric_value": 67.36554492202158, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "max_token_latency", "perf_metric_value": 1.8464868124574423, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "max_e2e_latency", "perf_metric_value": 236.3503119945526, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "min_token_latency", "perf_metric_value": 0.04214267246425152, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "min_e2e_latency", "perf_metric_value": 5.394262075424194, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "time_to_first_token", "perf_metric_value": 0.05201468616724014, "perf_metric_type": "LATENCY"}]}
llava 2
{"model_name": "llava-hf", "engine_args": {}, "benchmark_args": {"verbose": false, "backend": "VLLMChat", "results_filename": "metrics.jsonl", "port": 8000, "random_prompt_lens_mean": null, "random_prompt_lens_range": null, "distribution": "uniform", "qps": 2.0, "concurrency": 10000, "model": "llava-hf/llava-1.5-7b-hf", "warmup": false, "skip_wait_for_ready": true, "repeat": 1, "log_latencies": false, "fail_on_response_failure": false, "variable_response_lens_mean": null, "variable_response_lens_range": null, "variable_response_lens_distribution": "uniform", "num_requests": 1000, "prompts_filename": "filtered-prompts.json", "gen_random_prompts": false, "images_path": "/mnt/local_storage/pixel_values", "allow_variable_generation_length": false, "fixed_max_tokens": 128, "print_generation_lens_and_exit": false, "name": ""}, "backend": "VLLMChat", "input_len": null, "output_len": null, "tp": "NA", "dur_s": 505.2397577762604, "tokens_per_s": 1387.1810149793623, "qps": 1.9792583315322514, "successful_responses": 1000, "prompt_token_count": 597200, "response_token_count": 103659, "perf_metrics": [{"perf_metric_name": "tokens_per_s", "perf_metric_value": 1387.1810149793623, "perf_metric_type": "THROUGHPUT"}, {"perf_metric_name": "qps", "perf_metric_value": 1.9792583315322514, "perf_metric_type": "THROUGHPUT"}, {"perf_metric_name": "p50_token_latency", "perf_metric_value": 0.0570235475897789, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p90_token_latency", "perf_metric_value": 0.06673365142196416, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p99_token_latency", "perf_metric_value": 0.06943130027502775, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p50_e2e_latency", "perf_metric_value": 7.299014091491699, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p90_e2e_latency", "perf_metric_value": 8.541907382011413, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p99_e2e_latency", "perf_metric_value": 8.887206435203552, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "std_token_latency", "perf_metric_value": 0.01084388019864346, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "std_e2e_latency", "perf_metric_value": 1.3880166654263628, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "max_token_latency", "perf_metric_value": 0.07008214667439461, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "max_e2e_latency", "perf_metric_value": 8.97051477432251, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "min_token_latency", "perf_metric_value": 0.03003861755132675, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "min_e2e_latency", "perf_metric_value": 3.844943046569824, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "time_to_first_token", "perf_metric_value": 0.03003861755132675, "perf_metric_type": "LATENCY"}]}
llava 1
{"model_name": "llava-hf", "engine_args": {}, "benchmark_args": {"verbose": false, "backend": "VLLMChat", "results_filename": "metrics.jsonl", "port": 8000, "random_prompt_lens_mean": null, "random_prompt_lens_range": null, "distribution": "uniform", "qps": 1.0, "concurrency": 10000, "model": "llava-hf/llava-1.5-7b-hf", "warmup": false, "skip_wait_for_ready": true, "repeat": 1, "log_latencies": false, "fail_on_response_failure": false, "variable_response_lens_mean": null, "variable_response_lens_range": null, "variable_response_lens_distribution": "uniform", "num_requests": 1000, "prompts_filename": "filtered-prompts.json", "gen_random_prompts": false, "images_path": "/mnt/local_storage/pixel_values", "allow_variable_generation_length": false, "fixed_max_tokens": 128, "print_generation_lens_and_exit": false, "name": ""}, "backend": "VLLMChat", "input_len": null, "output_len": null, "tp": "NA", "dur_s": 1003.8169505596161, "tokens_per_s": 698.2707351267942, "qps": 0.9961975631538318, "successful_responses": 1000, "prompt_token_count": 597200, "response_token_count": 103736, "perf_metrics": [{"perf_metric_name": "tokens_per_s", "perf_metric_value": 698.2707351267942, "perf_metric_type": "THROUGHPUT"}, {"perf_metric_name": "qps", "perf_metric_value": 0.9961975631538318, "perf_metric_type": "THROUGHPUT"}, {"perf_metric_name": "p50_token_latency", "perf_metric_value": 0.03700142353773117, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p90_token_latency", "perf_metric_value": 0.04382942002266645, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p99_token_latency", "perf_metric_value": 0.0442644190788269, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p50_e2e_latency", "perf_metric_value": 4.73618221282959, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p90_e2e_latency", "perf_metric_value": 5.610165762901306, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p99_e2e_latency", "perf_metric_value": 5.6658456420898435, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "std_token_latency", "perf_metric_value": 0.007194200545318932, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "std_e2e_latency", "perf_metric_value": 0.9208576698008233, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "max_token_latency", "perf_metric_value": 0.04442277178168297, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "max_e2e_latency", "perf_metric_value": 5.68611478805542, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "min_token_latency", "perf_metric_value": 0.01992327906191349, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "min_e2e_latency", "perf_metric_value": 2.5501797199249268, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "time_to_first_token", "perf_metric_value": 0.03598182462155819, "perf_metric_type": "LATENCY"}]}
llama 4
{"model_name": "meta-llama", "engine_args": {}, "benchmark_args": {"verbose": false, "backend": "VLLMChat", "results_filename": "metrics.jsonl", "port": 8000, "random_prompt_lens_mean": null, "random_prompt_lens_range": null, "distribution": "uniform", "qps": 4.0, "concurrency": 10000, "model": "meta-llama/Llama-2-7b-chat-hf", "warmup": false, "skip_wait_for_ready": true, "repeat": 1, "log_latencies": false, "fail_on_response_failure": false, "variable_response_lens_mean": null, "variable_response_lens_range": null, "variable_response_lens_distribution": "uniform", "num_requests": 1000, "prompts_filename": "./filtered-prompts_text.json", "gen_random_prompts": false, "images_path": null, "allow_variable_generation_length": false, "fixed_max_tokens": 128, "print_generation_lens_and_exit": false, "name": ""}, "backend": "VLLMChat", "input_len": null, "output_len": null, "tp": "NA", "dur_s": 573.5994899272919, "tokens_per_s": 1198.8556685905814, "qps": 1.7433767246319511, "successful_responses": 1000, "prompt_token_count": 559663, "response_token_count": 128000, "perf_metrics": [{"perf_metric_name": "tokens_per_s", "perf_metric_value": 1198.8556685905814, "perf_metric_type": "THROUGHPUT"}, {"perf_metric_name": "qps", "perf_metric_value": 1.7433767246319511, "perf_metric_type": "THROUGHPUT"}, {"perf_metric_name": "p50_token_latency", "perf_metric_value": 1.3127137478441, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p90_token_latency", "perf_metric_value": 2.3271125741302967, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p99_token_latency", "perf_metric_value": 2.5279285298660397, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p50_e2e_latency", "perf_metric_value": 168.0273597240448, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p90_e2e_latency", "perf_metric_value": 297.870409488678, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p99_e2e_latency", "perf_metric_value": 323.5748518228531, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "std_token_latency", "perf_metric_value": 0.7374774699708819, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "std_e2e_latency", "perf_metric_value": 94.39711615627289, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "max_token_latency", "perf_metric_value": 2.5435944478958845, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "max_e2e_latency", "perf_metric_value": 325.5800893306732, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "min_token_latency", "perf_metric_value": 0.057532334700226784, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "min_e2e_latency", "perf_metric_value": 7.364138841629028, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "time_to_first_token", "perf_metric_value": 0.057532334700226784, "perf_metric_type": "LATENCY"}]}
llama 2
{"model_name": "meta-llama", "engine_args": {}, "benchmark_args": {"verbose": false, "backend": "VLLMChat", "results_filename": "metrics.jsonl", "port": 8000, "random_prompt_lens_mean": null, "random_prompt_lens_range": null, "distribution": "uniform", "qps": 2.0, "concurrency": 10000, "model": "meta-llama/Llama-2-7b-chat-hf", "warmup": false, "skip_wait_for_ready": true, "repeat": 1, "log_latencies": false, "fail_on_response_failure": false, "variable_response_lens_mean": null, "variable_response_lens_range": null, "variable_response_lens_distribution": "uniform", "num_requests": 1000, "prompts_filename": "./filtered-prompts_text.json", "gen_random_prompts": false, "images_path": null, "allow_variable_generation_length": false, "fixed_max_tokens": 128, "print_generation_lens_and_exit": false, "name": ""}, "backend": "VLLMChat", "input_len": null, "output_len": null, "tp": "NA", "dur_s": 560.4678485393524, "tokens_per_s": 1226.9445995736128, "qps": 1.7842236670776424, "successful_responses": 1000, "prompt_token_count": 559663, "response_token_count": 128000, "perf_metrics": [{"perf_metric_name": "tokens_per_s", "perf_metric_value": 1226.9445995736128, "perf_metric_type": "THROUGHPUT"}, {"perf_metric_name": "qps", "perf_metric_value": 1.7842236670776424, "perf_metric_type": "THROUGHPUT"}, {"perf_metric_name": "p50_token_latency", "perf_metric_value": 0.27004454750567675, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p90_token_latency", "perf_metric_value": 0.45103663466870786, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p99_token_latency", "perf_metric_value": 0.4932441928051412, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p50_e2e_latency", "perf_metric_value": 34.56570208072662, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p90_e2e_latency", "perf_metric_value": 57.732689237594606, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p99_e2e_latency", "perf_metric_value": 63.135256679058074, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "std_token_latency", "perf_metric_value": 0.1288162593920807, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "std_e2e_latency", "perf_metric_value": 16.48848120218633, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "max_token_latency", "perf_metric_value": 0.5058726519346237, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "max_e2e_latency", "perf_metric_value": 64.75169944763184, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "min_token_latency", "perf_metric_value": 0.05014154687523842, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "min_e2e_latency", "perf_metric_value": 6.418118000030518, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "time_to_first_token", "perf_metric_value": 0.05014154687523842, "perf_metric_type": "LATENCY"}]}
llama 1
{"model_name": "meta-llama", "engine_args": {}, "benchmark_args": {"verbose": false, "backend": "VLLMChat", "results_filename": "metrics.jsonl", "port": 8000, "random_prompt_lens_mean": null, "random_prompt_lens_range": null, "distribution": "uniform", "qps": 1.0, "concurrency": 10000, "model": "meta-llama/Llama-2-7b-chat-hf", "warmup": false, "skip_wait_for_ready": true, "repeat": 1, "log_latencies": false, "fail_on_response_failure": false, "variable_response_lens_mean": null, "variable_response_lens_range": null, "variable_response_lens_distribution": "uniform", "num_requests": 1000, "prompts_filename": "./filtered-prompts_text.json", "gen_random_prompts": false, "images_path": null, "allow_variable_generation_length": false, "fixed_max_tokens": 128, "print_generation_lens_and_exit": false, "name": ""}, "backend": "VLLMChat", "input_len": null, "output_len": null, "tp": "NA", "dur_s": 1004.7631087303162, "tokens_per_s": 684.4031135547717, "qps": 0.9952594709251068, "successful_responses": 1000, "prompt_token_count": 559663, "response_token_count": 128000, "perf_metrics": [{"perf_metric_name": "tokens_per_s", "perf_metric_value": 684.4031135547717, "perf_metric_type": "THROUGHPUT"}, {"perf_metric_name": "qps", "perf_metric_value": 0.9952594709251068, "perf_metric_type": "THROUGHPUT"}, {"perf_metric_name": "p50_token_latency", "perf_metric_value": 0.04272072482854128, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p90_token_latency", "perf_metric_value": 0.04336111098527908, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p99_token_latency", "perf_metric_value": 0.04386281063780188, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p50_e2e_latency", "perf_metric_value": 5.468252778053284, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p90_e2e_latency", "perf_metric_value": 5.5502222061157225, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "p99_e2e_latency", "perf_metric_value": 5.614439761638641, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "std_token_latency", "perf_metric_value": 0.0005604126560521587, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "std_e2e_latency", "perf_metric_value": 0.07173281997467632, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "max_token_latency", "perf_metric_value": 0.044527338817715645, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "max_e2e_latency", "perf_metric_value": 5.6994993686676025, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "min_token_latency", "perf_metric_value": 0.03547539748251438, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "min_e2e_latency", "perf_metric_value": 4.540850877761841, "perf_metric_type": "LATENCY"}, {"perf_metric_name": "time_to_first_token", "perf_metric_value": 0.040352869778871536, "perf_metric_type": "LATENCY"}]}