Benchmarking inference
import argparse
import datetime

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint", type=str, help="Checkpoint path", required=True)
    parser.add_argument("--max-memory-per-gpu", type=str, help="Maximum memory allocated per GPU, e.g. '50GB'", required=True)
    parser.add_argument("--seq_len", type=int, default=20, help="Max generation length")
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--nb_gpus", type=int, default=8)
    parser.add_argument("--load_8bit", action="store_true")
    return parser.parse_args()


def get_gpus_max_memory(max_memory, nb_gpus):
    # Build a {device_index: max_memory} dict covering the first `nb_gpus` visible GPUs,
    # as expected by the `max_memory` argument of `from_pretrained`
    max_memory = {i: max_memory for i in range(torch.cuda.device_count())[:nb_gpus]}
    return max_memory


def main():
    args = get_args()
    tokenizer = AutoTokenizer.from_pretrained(args.checkpoint, padding_side="left")
    batch_size = args.batch_size
    max_len = args.seq_len
    print("Loaded tokenizer!")

    # Build a dummy batch of identical prompts
    texts = ["test"] * batch_size
    batch = tokenizer(texts, return_tensors="pt")
    input_ids = batch["input_ids"].to(0)
    attention_mask = batch["attention_mask"].to(0)

    # Model instantiation, sharded across GPUs according to the memory budget
    print("Loading model")
    start = datetime.datetime.now()
    model = AutoModelForCausalLM.from_pretrained(
        args.checkpoint,
        device_map="auto",
        max_memory=get_gpus_max_memory(args.max_memory_per_gpu, args.nb_gpus),
        load_in_8bit=args.load_8bit,
    )
    print(f"Loaded model in {datetime.datetime.now() - start}")

    # Memory footprint check
    mem = model.get_memory_footprint()
    print(f"Memory footprint: {mem}")

    try:
        # Warm-up run so initialization costs are excluded from the benchmark
        _ = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=max_len)
        # Timed run
        start = datetime.datetime.now()
        _ = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=max_len)
        print(f"Completed generation in {datetime.datetime.now() - start}")
    except RuntimeError:
        # CUDA out-of-memory errors surface as RuntimeError
        print("oom")


if __name__ == "__main__":
    main()
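For reference, a hypothetical invocation of the script above. The file name, checkpoint, and memory budget are placeholders, not from the original:

# Assuming the gist is saved as benchmark_inference.py; checkpoint and 50GB budget are illustrative
python benchmark_inference.py \
    --checkpoint bigscience/bloom \
    --max-memory-per-gpu 50GB \
    --batch_size 8 \
    --seq_len 20 \
    --nb_gpus 4 \
    --load_8bit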
Instructions to create the right virtual environment:
Tested on a 4xA100 80GB node of the Jean Zay supercomputer. I would suggest using a conda virtual environment to run the experiments instead of a Python venv. Make sure you are using a recent generation of NVIDIA GPU (A100 or T4) to be compatible with bitsandbytes.
Create the conda env
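The exact commands are not preserved here; a plausible sketch, assuming an environment name of int8_benchmark and Python 3.8 (both illustrative choices):

# Create and activate a fresh conda environment
conda create -n int8_benchmark python=3.8 -y
conda activate int8_benchmark
# Install PyTorch built against a recent CUDA toolkit (needed for sm_80 GPUs such as the A100)
conda install pytorch cudatoolkit=11.3 -c pytorch -y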
Get transformers + latest accelerate
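Something along these lines, installing accelerate from source to get the latest device_map support; the exact pinning used originally is an assumption:

# transformers from PyPI, accelerate from the main branch (assumed setup)
pip install transformers
pip install git+https://github.com/huggingface/accelerate.git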
Get bitsandbytes
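The library ships pre-built CUDA kernels via pip; a minimal sketch:

# 8-bit quantization kernels used by load_in_8bit
pip install bitsandbytes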
Troubleshooting
If you get an error like:
NVIDIA A100-SXM4-80GB with CUDA capability sm_80 is not compatible with the current PyTorch installation. The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70. If you want to use the NVIDIA A100-SXM4-80GB GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/
Please run:
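The command that followed is not preserved. The usual fix for this error is to reinstall PyTorch built against a CUDA version that supports sm_80, for example (the CUDA tag is an assumption; match it to your driver):

# Illustrative: reinstall PyTorch wheels built with CUDA 11.3
pip install --upgrade torch --extra-index-url https://download.pytorch.org/whl/cu113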