modal script that creates llama2_image and defines a function that can run inference on it
from modal import Image, Stub, Secret, gpu
from pathlib import Path
import os
MODEL_PATH = "/model"
def download_models():
    from transformers import AutoTokenizer, AutoModelForCausalLM

    token = os.environ["HUGGINGFACE_TOKEN"]
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", use_auth_token=token)
    tokenizer.save_pretrained(MODEL_PATH)
    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", use_auth_token=token)
    model.save_pretrained(MODEL_PATH)
## adapted from https://github.com/modal-labs/doppel-bot/blob/main/src/common.py
# versions might be out of date
llama2_image = (
    Image.micromamba()
    .micromamba_install(
        "cudatoolkit=11.7",
        "cudnn=8.1.0",
        "cuda-nvcc",
        channels=["conda-forge", "nvidia"],
    )
    .apt_install("git")
    .pip_install(
        "accelerate==0.18.0",
        "bitsandbytes==0.37.0",
        "bitsandbytes-cuda117==0.26.0.post2",
        "datasets==2.10.1",
        "fire==0.5.0",
        "gradio==3.23.0",
        "peft @ git+https://github.com/huggingface/peft.git@e536616888d51b453ed354a6f1e243fecb02ea08",
        "transformers @ git+https://github.com/huggingface/transformers.git@a92e0ad2e20ef4ce28410b5e05c5d63a5a304e65",
        "torch==2.0.0",
        "torchvision==0.15.1",
        "sentencepiece==0.1.97",
    )
    .run_function(download_models, memory=32768, secret=Secret.from_name("hugging-face"), timeout=3600)
)
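
# The image build above expects a Modal secret named "hugging-face" that exposes a
# HUGGINGFACE_TOKEN environment variable (used in download_models). A minimal sketch of
# creating it with the Modal CLI (exact syntax may differ between Modal versions):
#   modal secret create hugging-face HUGGINGFACE_TOKEN=<your-hf-access-token>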
stub = Stub(name="llama2", image=llama2_image)
@stub.function(
    gpu=gpu.A100(memory=40),
)
def main():
"""
run this function: modal run modal_llama2::main
prereqs:
- modal hugging-face secret must be configured correctly
- you must have access to https://huggingface.co/meta-llama/Llama-2-7b-hf (request access on hugging face & https://ai.meta.com/resources/models-and-libraries/llama-downloads; took me ~1 hr to get approved)
first run will need to create the image, which takes ~20 mins (downloading ~30 GB from huggingface)
later runs take < 1 min
see this notebook for other things you can do with the model: https://github.com/facebookresearch/llama-recipes/blob/main/quickstart.ipynb
"""
    from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM
    import torch

    load_8bit = False
    device = "cuda"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model: LlamaForCausalLM = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        load_in_8bit=load_8bit,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model.eval()

    from transformers import GenerationConfig

    # prompt copied from https://github.com/facebookresearch/llama-recipes/blob/main/quickstart.ipynb
    prompt = """
Summarize this dialog:
A: Hi Tom, are you busy tomorrow’s afternoon?
B: I’m pretty sure I am. What’s up?
A: Can you go with me to the animal shelter?.
B: What do you want to do?
A: I want to get a puppy for my son.
B: That will make him so happy.
A: Yeah, we’ve discussed it many times. I think he’s ready now.
B: That’s good. Raising a dog is a tough issue. Like having a baby ;-)
A: I'll get him one of those little dogs.
B: One that won't grow up too big;-)
A: And eat too much;-))
B: Do you know which one he would like?
A: Oh, yes, I took him there last Monday. He showed me one that he really liked.
B: I bet you had to drag him away.
A: He wanted to take it home right away ;-).
B: I wonder what he'll name it.
A: He said he’d name it after his dead hamster – Lemmy - he's a great Motorhead fan :-)))
---
Summary:
"""
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    # tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    # print(tokens)
    generation_config = GenerationConfig()
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            # parameters below are set arbitrarily; a lot are just defaults
            return_dict_in_generate=True,
            output_scores=True,
            do_sample=True,
            temperature=0.3,
            top_p=0.85,
            top_k=40,
            num_beams=1,
            max_new_tokens=600,
            repetition_penalty=1.2,
        )
    s = generation_output.sequences[0]
    run_output = tokenizer.decode(s)
    print("Run output:", run_output)
    return run_output
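
# Optional convenience entrypoint (a sketch, not part of the original gist): lets you run the
# script with just `modal run modal_llama2.py`. The entrypoint name `cli` is arbitrary, and the
# remote-call method depends on your Modal client version: `main.call()` on clients from this
# era, `main.remote()` on newer ones.
@stub.local_entrypoint()
def cli():
    summary = main.call()  # use main.remote() on newer Modal clients
    print(summary)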