@younesbelkada
Created August 2, 2022 21:35
Benchmark using `generate`
import time
import torch
import numpy as np
import argparse
from transformers import AutoModelForCausalLM, AutoTokenizer
parser = argparse.ArgumentParser(description='Benchmark pipeline runtime for int8 models')
parser.add_argument('--batch_size', default=1, type=int, help='batch_size for experiments')
parser.add_argument('--nb_runs', default=10, type=int, help='number of times for repeating experiments')
parser.add_argument('--nb_gpus', default=7, type=int, help='number of GPUs to use')
parser.add_argument('--seq_length', default=20, type=int, help='maximum number of tokens to generate')
parser.add_argument('--max_memory', default="30GB", type=str, help='Maximum memory to use for each GPU')
parser.add_argument('--model_name', default="facebook/opt-1.3b", type=str, help='Model to benchmark')
parser.add_argument('--load_8bit', action='store_true')
args = parser.parse_args()
NB_RUNS = args.nb_runs
BATCH_SIZE = args.batch_size
MAX_LEN = args.seq_length
load_8bit = args.load_8bit
def get_input():
    # Build a batch of identical dummy prompts, one per batch element
    input_test = ["test" for _ in range(BATCH_SIZE)]
    return input_test
def simple_generation(model, encoded_text, max_length):
    # Generate up to `max_length` tokens; the output is discarded since only the runtime matters
    _ = model.generate(encoded_text, max_length=max_length)
def run_pipeline(encoded_text, model, max_length):
    # Time NB_RUNS generation calls and return the per-run durations in seconds
    total_time = []
    for _ in range(NB_RUNS):
        start = time.perf_counter()
        simple_generation(model, encoded_text, max_length)
        # Wait for all pending CUDA kernels before stopping the timer,
        # otherwise asynchronous GPU work would be excluded from the measurement
        torch.cuda.synchronize()
        end = time.perf_counter()
        total_time.append(end - start)
    return total_time
def get_gpus_max_memory(max_memory, n_gpus):
    # Cap each of the first `n_gpus` devices at `max_memory` (e.g. "30GB") for device_map="auto"
    assert n_gpus <= torch.cuda.device_count(), "You are requesting more GPUs than are available"
    max_memory = {i: max_memory for i in range(n_gpus)}
    return max_memory
input_test = get_input()
mapping_gpu_memory = get_gpus_max_memory(args.max_memory, args.nb_gpus)
# Dispatch the model across the requested GPUs; load_in_8bit quantizes the weights with bitsandbytes
model = AutoModelForCausalLM.from_pretrained(
    args.model_name,
    device_map="auto",
    max_memory=mapping_gpu_memory,
    torch_dtype="auto",
    load_in_8bit=load_8bit,
)
tokenizer = AutoTokenizer.from_pretrained(args.model_name)
encoded_text = tokenizer(input_test, return_tensors="pt")["input_ids"]
# Warm-up run (CUDA context creation, weight dispatch) excluded from the timed runs
simple_generation(model, encoded_text, MAX_LEN)
# Run the benchmark
total_time = run_pipeline(encoded_text, model, MAX_LEN)
print("Time elapsed: {} +- {}".format(np.mean(total_time), np.std(total_time)))