
@tiandiao123
Created November 19, 2023 05:29
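# Offline latency/throughput benchmark for vLLM: builds a batch of dummy
# prompts, warms up the engine, then times end-to-end generation and reports
# the average latency and total token throughput.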
from vllm import LLM, SamplingParams
import torch
from torch import distributed as dist
import time
from tqdm import tqdm
import numpy as np

# Create an LLM.
llm = LLM(
    # model="/home/lclcq/share/llama-7b",
    model="/home/lclhx/share/Colossal-LLaMA-2-7b-sft",
    # model="/home/lclcq/share/models--bigscience--bloom-560m/snapshots/4f42c91d806a19ae1a46af6c3fb5f4990d884cd6",
    # model="facebook/opt-125m",
    tensor_parallel_size=1,
    # max_num_seqs=1,
    # max_num_batched_tokens=2048,
    gpu_memory_utilization=0.95,
    trust_remote_code=True)
def run_to_completion(sampling_params, dummy_prompt_token_ids, profile: bool = False):
    if profile:
        torch.cuda.cudart().cudaProfilerStart()
    torch.cuda.synchronize()
    start_time = time.time()
    llm.generate(prompt_token_ids=dummy_prompt_token_ids,
                 sampling_params=sampling_params,
                 use_tqdm=False)
    torch.cuda.synchronize()
    end_time = time.time()
    latency = end_time - start_time
    if profile:
        torch.cuda.cudart().cudaProfilerStop()
    return latency
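# Note: with profile=True, the cudaProfilerStart/Stop calls above delimit the
# capture range, e.g. when running under
# `nsys profile --capture-range=cudaProfilerApi`.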
batch = 32
input_len = 512
out_len = 256

sampling_params = SamplingParams(
    n=1,
    temperature=1.0,
    top_p=1.0,
    use_beam_search=False,
    ignore_eos=True,
    max_tokens=out_len,
)
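# With ignore_eos=True and max_tokens=out_len, every sequence generates exactly
# out_len tokens, so the throughput reported below is batch * out_len / latency.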
# Build the dummy prompts: each of the `batch` prompts is just the token IDs
# 0..input_len-1. (dummy_prompt_token_ids_s holds random token IDs but is not
# used below.)
dummy_prompt_token_ids = []
dummy_prompt_token_ids_s = torch.randint(1, 10240, (batch, input_len))
for t in range(batch):
    a = []
    for i in range(input_len):
        a.append(i)
    dummy_prompt_token_ids.append(a)
# print(dummy_prompt_token_ids)
# print("Warming up...")
for i in range(2):
run_to_completion(sampling_params, dummy_prompt_token_ids, profile=False)
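# Two untimed warm-up iterations so one-off costs (CUDA context creation,
# memory-pool growth) do not skew the measured latencies.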
# Benchmark.
# latencies = []
# for _ in range(5):  # tqdm(range(5), desc="Profiling iterations"):
#     latencies.append(run_to_completion(sampling_params, dummy_prompt_token_ids, profile=False))
# prefill_avg_latency = np.mean(latencies)
# print(f'prefill latency: {prefill_avg_latency*1000 / out_len} ms')
# print(f'Avg throughput: {out_len/avg_latency} tokens/seconds')

# out_len = 128
# sampling_params = SamplingParams(
#     n=1,
#     temperature=1.0,
#     top_p=1.0,
#     use_beam_search=False,
#     ignore_eos=True,
#     max_tokens=out_len,
# )
# Rebuild the same dummy prompts for the timed runs.
dummy_prompt_token_ids = []
dummy_prompt_token_ids_s = torch.randint(1, 10240, (batch, input_len))
for t in range(batch):
    a = []
    for i in range(input_len):
        a.append(i)
    dummy_prompt_token_ids.append(a)
# Timed benchmark: average end-to-end latency over 5 iterations.
latencies = []
for _ in range(5):  # tqdm(range(5), desc="Profiling iterations"):
    latencies.append(run_to_completion(sampling_params, dummy_prompt_token_ids, profile=False))
avg_latency = np.mean(latencies)
print(f'Avg latency: {avg_latency*1000} ms')
# print(f'Decode throughput: {batch*out_len/(avg_latency - prefill_avg_latency)} tokens/s')
print(f'Total throughput: {batch*out_len/avg_latency} tokens/s')
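# Example usage (hypothetical filename; the model path above must exist
# locally, or be swapped for one of the commented-out alternatives):
#   python vllm_latency_benchmark.py
# With batch=32, input_len=512, out_len=256 this prints the average end-to-end
# latency in ms and the total generation throughput in tokens/s.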