I use the following code, running on perlmutter.nersc.gov, single node, 4 A100 GPUs, each 80 GB memory:
Timing information
- Loading tokenizer and model: took 749.432817184017 seconds to execute.
- Creating pipeline: took 0.0001321239396929741 seconds to execute.
- Inferencing using the model: took 455.8312628919957 seconds to execute.
- This time depends on the output token length specified; I use 512 tokens. Longer output lengths require correspondingly longer inference time.
"""Interactive Llama-2-70B-chat demo: load the model once, then answer questions in a loop.

Intended for a single node with multiple GPUs: ``device_map="auto"`` shards the
model layers across all visible devices. Each stage (model load, pipeline
creation, generation) is timed with ``time.perf_counter``.
"""
# Use a pipeline as a high-level helper
from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import time
import torch
#from accelerate import init_empty_weights, load_checkpoint_and_dispatch
import accelerate

MODEL_NAME = "meta-llama/Llama-2-70b-chat-hf"


def main():
    t1 = time.perf_counter()
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # torch_dtype=torch.float16 roughly halves load time and memory versus the
    # float32 default (~140 GB vs ~280 GB for a 70B model); the Llama-2
    # checkpoints are distributed in half precision anyway.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",          # shard layers across all visible GPUs
        torch_dtype=torch.float16,
    )
    t2 = time.perf_counter()
    print(f"Loading tokenizer and model: took {t2-t1} seconds to execute.")

    # Create a pipeline (cheap: model and tokenizer are already in memory)
    code_generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
    t3 = time.perf_counter()
    print(f"Creating pipeline: took {t3-t2} seconds to execute.")

    # Generate code for an input string; loop until the user interrupts (Ctrl-C)
    while True:
        print("\n=========Please type in your question=========================\n")
        # str.strip() returns a NEW string — the original called it and
        # discarded the result, so the input was never actually stripped.
        user_content = input("\nQuestion: ").strip()  # User question
        t1 = time.perf_counter()
        # max_new_tokens bounds only the generated continuation. The original
        # max_length=512 counted the prompt tokens too, so a long prompt
        # silently shrank (or eliminated) the answer.
        generated_code = code_generator(user_content, max_new_tokens=512)[0]['generated_text']
        t2 = time.perf_counter()
        print(f"Inferencing using the model: took {t2-t1} seconds to execute.")
        print(generated_code)


if __name__ == "__main__":
    main()