# !pip install -qU transformers accelerate einops langchain xformers
from torch import cuda, bfloat16
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
# Run on the current CUDA device when one is available, otherwise on the CPU.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# Initialize the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained("mosaicml/mpt-7b-chat", trust_remote_code=True)

# Initialize model with Triton optimization. This is supposed to speed up the
# model at the cost of using more memory, but I haven't been able to get it to
# work yet. The attempt is kept below for reference.
# optimize = False
# if optimize:
#     config = AutoConfig.from_pretrained(
#         'mosaicml/mpt-7b-chat',
#         trust_remote_code=True
#     )
#     config.attn_config['attn_impl'] = 'triton'
#     # config.update({"init_device": "meta"})  # This causes an issue when calling
#     config.update({"max_seq_len": 100})
# else:
#     config = {"init_device": "meta"}
config = {"init_device": "meta"}

# NOTE(review): the original from_pretrained call was cut off after its first
# argument, leaving a syntax error. The arguments below are reconstructed from
# what the file imports (bfloat16) and from the tokenizer load above
# (trust_remote_code=True) -- confirm against the intended setup.
# NOTE(review): `config` is a plain dict rather than a config object, so it is
# not passed here; confirm whether the "meta" init device was meant to apply.
model = AutoModelForCausalLM.from_pretrained(
    "mosaicml/mpt-7b-chat",
    trust_remote_code=True,
    torch_dtype=bfloat16,
)
# The prompt tensor is moved to `device` before generation, so the model has
# to live on the same device.
model.to(device)

# tokenizer.eval() # fails!
# model.eval() is not needed: per the transformers documentation,
# from_pretrained() already returns the model in evaluation mode.
import time
from IPython.display import Markdown
def ask_question(question, max_length=100):
    """Ask the chat model a question, show its answer and print the elapsed time.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        question: Prompt text to send to the model.
        max_length: Passed to ``model.generate`` as ``max_length``; in
            transformers this bounds the prompt tokens plus the generated
            tokens together.

    NOTE(review): several statements in the original body were truncated
    (the ``input_ids =`` assignment and the ``model.generate(`` call). The
    completed forms below are reconstructed from the names that were defined
    but never used (``device``, ``max_length``, ``stopping_criteria``,
    ``response``, ``Markdown``) -- confirm against the intended behaviour.
    """
    # Kept as function-local imports, as in the original body.
    import torch
    from IPython.display import display
    from transformers import StoppingCriteria, StoppingCriteriaList

    start_time = time.perf_counter()

    # Encode the question and place it on the same device as the model.
    input_ids = tokenizer.encode(question, return_tensors='pt')
    input_ids = input_ids.to(device)

    # mpt-7b is trained to add "<|endoftext|>" at the end of generations
    stop_token_ids = set(tokenizer.convert_tokens_to_ids(["<|endoftext|>"]))

    # define custom stopping criteria object
    class StopOnTokens(StoppingCriteria):
        """Stop generation once the newest token of sequence 0 is a stop token."""

        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
            # Only the first sequence is inspected, matching the original check.
            return input_ids[0, -1].item() in stop_token_ids

    stopping_criteria = StoppingCriteriaList([StopOnTokens()])

    # Generate a response
    output = model.generate(
        input_ids,
        max_length=max_length,
        stopping_criteria=stopping_criteria,
        # Options tried earlier and left disabled:
        # max_length=1000,
        # pad_token_id=stop_token_ids[0],
        # num_return_sequences=1,
        # top_p=0.15,  # select from top tokens whose probability add up to 15%
        # top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
        # max_new_tokens=64,  # max number of tokens to generate in the output
        # repetition_penalty=1.1  # without this output begins repeating
    )

    # Decode the response, dropping the prompt tokens that generate() echoes
    # back at the start of the sequence.
    prompt_len = input_ids.shape[-1]
    response = tokenizer.decode(output[0, prompt_len:], skip_special_tokens=True)

    duration = time.perf_counter() - start_time

    # Previously the decoded response was computed and then discarded; render
    # it so the call actually shows the answer.
    display(Markdown(response))
    print("Function duration:", duration, "seconds")
# Ask a question: run one short prompt through ask_question with its default
# max_length.
ask_question("What is the capital of France?")
# Further prompts to try; the second argument is ask_question's max_length.
# ask_question("Explain to me the difference between nuclear fission and fusion.", 200)
# ask_question("write python code that converts a csv into a pdf", 400)
