Modal script that builds llama2_image (a container image with the Llama-2-7b weights baked in) and defines a function that runs inference inside it
from modal import Image, Stub, Secret, gpu
from pathlib import Path
import os

MODEL_PATH = "/model"


def download_models():
    # Runs at image build time: downloads the gated Llama 2 weights from Hugging Face
    # and bakes them into the image at MODEL_PATH.
    from transformers import AutoTokenizer, AutoModelForCausalLM

    token = os.environ["HUGGINGFACE_TOKEN"]
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", use_auth_token=token)
    tokenizer.save_pretrained(MODEL_PATH)
    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", use_auth_token=token)
    model.save_pretrained(MODEL_PATH)


## adapted from https://github.com/modal-labs/doppel-bot/blob/main/src/common.py
# versions might be out of date
llama2_image = (
    Image.micromamba()
    .micromamba_install(
        "cudatoolkit=11.7",
        "cudnn=8.1.0",
        "cuda-nvcc",
        channels=["conda-forge", "nvidia"],
    )
    .apt_install("git")
    .pip_install(
        "accelerate==0.18.0",
        "bitsandbytes==0.37.0",
        "bitsandbytes-cuda117==0.26.0.post2",
        "datasets==2.10.1",
        "fire==0.5.0",
        "gradio==3.23.0",
        "peft @ git+https://github.com/huggingface/peft.git@e536616888d51b453ed354a6f1e243fecb02ea08",
        "transformers @ git+https://github.com/huggingface/transformers.git@a92e0ad2e20ef4ce28410b5e05c5d63a5a304e65",
        "torch==2.0.0",
        "torchvision==0.15.1",
        "sentencepiece==0.1.97",
    )
    .run_function(download_models, memory=32768, secret=Secret.from_name("hugging-face"), timeout=3600)
)

stub = Stub(name="llama2", image=llama2_image)


@stub.function(
    gpu=gpu.A100(memory=40),
)
def main():
    """
    Run this function with: modal run modal_llama2::main

    Prereqs:
    - the Modal "hugging-face" secret must be configured correctly (it must expose HUGGINGFACE_TOKEN)
    - you must have access to https://huggingface.co/meta-llama/Llama-2-7b-hf
      (request access on Hugging Face and at https://ai.meta.com/resources/models-and-libraries/llama-downloads;
      it took me ~1 hr to get approved)

    The first run needs to build the image, which takes ~20 mins (downloading ~30 GB from Hugging Face);
    later runs take < 1 min.

    See this notebook for other things you can do with the model:
    https://github.com/facebookresearch/llama-recipes/blob/main/quickstart.ipynb
    """
    from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM
    import torch

    load_8bit = False
    device = "cuda"

    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model: LlamaForCausalLM = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        load_in_8bit=load_8bit,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model.eval()

    from transformers import GenerationConfig

    # prompt copied from https://github.com/facebookresearch/llama-recipes/blob/main/quickstart.ipynb
    prompt = """
Summarize this dialog:
A: Hi Tom, are you busy tomorrow’s afternoon?
B: I’m pretty sure I am. What’s up?
A: Can you go with me to the animal shelter?.
B: What do you want to do?
A: I want to get a puppy for my son.
B: That will make him so happy.
A: Yeah, we’ve discussed it many times. I think he’s ready now.
B: That’s good. Raising a dog is a tough issue. Like having a baby ;-)
A: I'll get him one of those little dogs.
B: One that won't grow up too big;-)
A: And eat too much;-))
B: Do you know which one he would like?
A: Oh, yes, I took him there last Monday. He showed me one that he really liked.
B: I bet you had to drag him away.
A: He wanted to take it home right away ;-).
B: I wonder what he'll name it.
A: He said he’d name it after his dead hamster – Lemmy - he's a great Motorhead fan :-)))
---
Summary:
"""

    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    # tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    # print(tokens)

    generation_config = GenerationConfig()
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            # parameters below are set arbitrarily; a lot are just defaults
            return_dict_in_generate=True,
            output_scores=True,
            do_sample=True,
            temperature=0.3,
            top_p=0.85,
            top_k=40,
            num_beams=1,
            max_new_tokens=600,
            repetition_penalty=1.2,
        )
    s = generation_output.sequences[0]
    run_output = tokenizer.decode(s)
    print("Run output:", run_output)
    return run_output
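Optionally, a local entrypoint can be appended to the end of the same script so that the summary returned by main() is also printed on your machine. This is a minimal sketch, not part of the original gist; it assumes the file is saved as modal_llama2.py, and the name of the remote-invocation method depends on your Modal version (call() on the older releases this gist targets, remote() on newer ones).

# Optional local entrypoint (sketch, not in the original gist).
# With exactly one local entrypoint defined, `modal run modal_llama2.py` should pick it up,
# invoke main() remotely in the A100 container, and print the returned summary locally.
@stub.local_entrypoint()
def run():
    summary = main.call()  # on newer Modal versions this is main.remote()
    print("Summary from remote run:", summary)

The hugging-face secret referenced by Secret.from_name must expose HUGGINGFACE_TOKEN; it can be created in the Modal dashboard or with the modal secret create CLI command before the first run.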