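# Benchmark script: measure end-to-end generation latency and total token
# throughput of a vLLM model (LLaMA-7B checkpoint) on a batch of dummy prompts.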
from vllm import LLM, SamplingParams
import torch
from torch import distributed as dist
import time
from tqdm import tqdm
import numpy as np
# Create an LLM.
llm = LLM(
    model="/home/lclcq/share/llama-7b",
    # model="/home/lclcq/share/models--bigscience--bloom-560m/snapshots/4f42c91d806a19ae1a46af6c3fb5f4990d884cd6",
    # model="facebook/opt-125m",
    tensor_parallel_size=1,
    # max_num_seqs=1,
    # max_num_batched_tokens=2048,
    gpu_memory_utilization=0.95,
    trust_remote_code=True,
)
def run_to_completion(sampling_params, dummy_prompt_token_ids, profile: bool = False):
    """Run one generation pass over the dummy prompts and return its wall-clock latency."""
    if profile:
        torch.cuda.cudart().cudaProfilerStart()
    torch.cuda.synchronize()
    start_time = time.time()
    llm.generate(prompt_token_ids=dummy_prompt_token_ids,
                 sampling_params=sampling_params,
                 use_tqdm=False)
    torch.cuda.synchronize()
    end_time = time.time()
    latency = end_time - start_time
    if profile:
        torch.cuda.cudart().cudaProfilerStop()
    return latency
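
# Benchmark configuration: number of sequences per batch, prompt length in
# tokens, and number of tokens generated per sequence.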
batch = 32
input_len = 1024
out_len = 128
sampling_params = SamplingParams(
    n=1,
    temperature=1.0,
    top_p=1.0,
    use_beam_search=False,
    ignore_eos=True,
    max_tokens=out_len,
)
# Build dummy prompts: each of the `batch` prompts is simply the token id
# sequence 0 .. input_len - 1. The random tensor below is created but not
# used for the prompts themselves.
dummy_prompt_token_ids = []
dummy_prompt_token_ids_s = torch.randint(1, 10240, (batch, input_len))
for t in range(batch):
    a = []
    for i in range(input_len):
        a.append(i)
    dummy_prompt_token_ids.append(a)
# print(dummy_prompt_token_ids)
# print("Warming up...")
for i in range(2):
run_to_completion(sampling_params, dummy_prompt_token_ids, profile=False)
# Benchmark.
# latencies = []
# for _ in range(5):  # tqdm(range(5), desc="Profiling iterations"):
#     latencies.append(run_to_completion(sampling_params, dummy_prompt_token_ids, profile=False))
# prefill_avg_latency = np.mean(latencies)
# print(f'prefill latency: {prefill_avg_latency*1000 / out_len} ms')
# print(f'Avg throughput: {out_len/avg_latency} tokens/seconds')
# out_len = 128
# sampling_params = SamplingParams(
#     n=1,
#     temperature=1.0,
#     top_p=1.0,
#     use_beam_search=False,
#     ignore_eos=True,
#     max_tokens=out_len,
# )
# Rebuild the dummy prompts (same construction as above).
dummy_prompt_token_ids = []
dummy_prompt_token_ids_s = torch.randint(1, 10240, (batch, input_len))
for t in range(batch):
    a = []
    for i in range(input_len):
        a.append(i)
    dummy_prompt_token_ids.append(a)
# Timed benchmark: average the end-to-end latency over 5 iterations.
latencies = []
for _ in range(5):  # tqdm(range(5), desc="Profiling iterations"):
    latencies.append(run_to_completion(sampling_params, dummy_prompt_token_ids, profile=False))
avg_latency = np.mean(latencies)
# print(f'Avg latency: {avg_latency*1000 / out_len} ms')
# print(f'Decode throughput: {batch*out_len/(avg_latency - prefill_avg_latency)} tokens/s')
print(f'total throughput: {batch*out_len/avg_latency} tokens/s')
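
# To capture a GPU profile of a single run, call
# run_to_completion(sampling_params, dummy_prompt_token_ids, profile=True)
# under a CUDA profiler that honors cudaProfilerStart/Stop (e.g. Nsight Systems
# with its cudaProfilerApi capture range), so only the generation pass is recorded.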