@younesbelkada · Last active November 17, 2023
Benchmarking the pipeline performance of int8 models: measures the generation latency of bigscience/bloom through the transformers pipeline, loaded either in bfloat16 or in int8 via load_in_8bit (toggled with --load_8bit).
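The script presumably needs accelerate (for device_map="auto") and bitsandbytes (for load_in_8bit) installed alongside transformers; a plausible setup would be:

pip install transformers accelerate bitsandbytes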
import time
import torch
import numpy as np
import argparse
from transformers import pipeline
parser = argparse.ArgumentParser(description='Benchmark pipeline runtime for int8 models')
parser.add_argument('--batch_size', default=1, type=int, help='batch_size for experiments')
parser.add_argument('--nb_runs', default=10, type=int, help='number of times for repeating experiments')
parser.add_argument('--nb_gpus', default=7, type=int, help='number of GPUs to use')
parser.add_argument('--seq_length', default=20, type=int, help='maximum number of tokens to generate')
parser.add_argument('--max_memory', default="30GB", type=str, help='Maximum memory to use for each GPU')
parser.add_argument('--load_8bit', action='store_true')
args = parser.parse_args()
NB_RUNS = args.nb_runs
BATCH_SIZE = args.batch_size
load_8bit = args.load_8bit
def get_input():
    # Build a batch of identical dummy prompts
    input_test = ["test" for _ in range(BATCH_SIZE)]
    return input_test
def run_pipeline():
    total_time = []
    for _ in range(NB_RUNS):
        start = time.perf_counter()
        _ = pipe(input_test)
        # Make sure all queued CUDA work has finished before stopping the timer,
        # otherwise the measurement can under-count GPU time.
        torch.cuda.synchronize()
        end = time.perf_counter()
        total_time.append(end - start)
    return total_time
def get_gpus_max_memory(max_memory, n_gpus):
    assert n_gpus <= torch.cuda.device_count(), "You are requesting more GPUs than are available"
    # Give every requested GPU the same memory budget, e.g. {0: "30GB", 1: "30GB", ...}
    return {i: max_memory for i in range(n_gpus)}
input_test = get_input()
mapping_gpu_memory = get_gpus_max_memory(args.max_memory, args.nb_gpus)

pipe = pipeline(
    model="bigscience/bloom",
    model_kwargs={
        "device_map": "auto",
        # fp16 is used together with int8 quantization; bf16 for the full-precision baseline
        "torch_dtype": torch.float16 if load_8bit else torch.bfloat16,
        "load_in_8bit": load_8bit,
        "max_memory": mapping_gpu_memory,
    },
    max_new_tokens=args.seq_length,
    batch_size=args.batch_size,
)
# Warmup run so one-time costs (CUDA context, weight loading) don't skew the timings
_ = pipe(input_test)
total_time = run_pipeline()
print("Time elapsed: {} +- {}".format(np.mean(total_time), np.std(total_time)))
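A quick usage sketch (the filename is hypothetical; the flags mirror the argparse definitions above). For example, timing int8 generation with batch size 8 across 8 GPUs:

python bloom_int8_benchmark.py --batch_size 8 --nb_runs 10 --nb_gpus 8 --seq_length 20 --max_memory 30GB --load_8bit

Dropping --load_8bit reruns the same benchmark in bfloat16, so the two reported means can be compared directly.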