Modal script that builds llama2_image (a container image with the Llama-2-7b weights baked in) and defines a function that runs inference inside it
from modal import Image, Stub, Secret, gpu
from pathlib import Path
import os

MODEL_PATH = "/model"


def download_models():
    # Runs at image build time: downloads the gated Llama 2 weights from Hugging Face
    # and bakes them into the image at MODEL_PATH.
    from transformers import AutoTokenizer, AutoModelForCausalLM

    token = os.environ["HUGGINGFACE_TOKEN"]
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", use_auth_token=token)
    tokenizer.save_pretrained(MODEL_PATH)
    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", use_auth_token=token)
    model.save_pretrained(MODEL_PATH)


## adapted from https://github.com/modal-labs/doppel-bot/blob/main/src/common.py
# versions might be out of date
llama2_image = (
    Image.micromamba()
    .micromamba_install(
        "cudatoolkit=11.7",
        "cudnn=8.1.0",
        "cuda-nvcc",
        channels=["conda-forge", "nvidia"],
    )
    .apt_install("git")
    .pip_install(
        "accelerate==0.18.0",
        "bitsandbytes==0.37.0",
        "bitsandbytes-cuda117==0.26.0.post2",
        "datasets==2.10.1",
        "fire==0.5.0",
        "gradio==3.23.0",
        "peft @ git+https://github.com/huggingface/peft.git@e536616888d51b453ed354a6f1e243fecb02ea08",
        "transformers @ git+https://github.com/huggingface/transformers.git@a92e0ad2e20ef4ce28410b5e05c5d63a5a304e65",
        "torch==2.0.0",
        "torchvision==0.15.1",
        "sentencepiece==0.1.97",
    )
    .run_function(download_models, memory=32768, secret=Secret.from_name("hugging-face"), timeout=3600)
)

stub = Stub(name="llama2", image=llama2_image)


@stub.function(
    gpu=gpu.A100(memory=40),
)
def main():
    """
    Run this function with: modal run modal_llama2::main

    Prereqs:
    - the Modal "hugging-face" secret must be configured correctly (it must expose HUGGINGFACE_TOKEN)
    - you must have access to https://huggingface.co/meta-llama/Llama-2-7b-hf
      (request access on Hugging Face and at https://ai.meta.com/resources/models-and-libraries/llama-downloads;
      it took me ~1 hr to get approved)

    The first run needs to build the image, which takes ~20 mins (downloading ~30 GB from Hugging Face);
    later runs take < 1 min.

    See this notebook for other things you can do with the model:
    https://github.com/facebookresearch/llama-recipes/blob/main/quickstart.ipynb
    """
    from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM
    import torch

    load_8bit = False
    device = "cuda"

    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model: LlamaForCausalLM = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        load_in_8bit=load_8bit,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model.eval()

    from transformers import GenerationConfig

    # prompt copied from https://github.com/facebookresearch/llama-recipes/blob/main/quickstart.ipynb
    prompt = """
Summarize this dialog:
A: Hi Tom, are you busy tomorrow’s afternoon?
B: I’m pretty sure I am. What’s up?
A: Can you go with me to the animal shelter?.
B: What do you want to do?
A: I want to get a puppy for my son.
B: That will make him so happy.
A: Yeah, we’ve discussed it many times. I think he’s ready now.
B: That’s good. Raising a dog is a tough issue. Like having a baby ;-)
A: I'll get him one of those little dogs.
B: One that won't grow up too big;-)
A: And eat too much;-))
B: Do you know which one he would like?
A: Oh, yes, I took him there last Monday. He showed me one that he really liked.
B: I bet you had to drag him away.
A: He wanted to take it home right away ;-).
B: I wonder what he'll name it.
A: He said he’d name it after his dead hamster – Lemmy - he's a great Motorhead fan :-)))
---
Summary:
"""

    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    # tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    # print(tokens)

    generation_config = GenerationConfig()
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            # parameters below are set arbitrarily; a lot are just defaults
            return_dict_in_generate=True,
            output_scores=True,
            do_sample=True,
            temperature=0.3,
            top_p=0.85,
            top_k=40,
            num_beams=1,
            max_new_tokens=600,
            repetition_penalty=1.2,
        )
    s = generation_output.sequences[0]
    run_output = tokenizer.decode(s)
    print("Run output:", run_output)
    return run_output
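Optionally, a local entrypoint can be appended to the end of the same script so that the summary returned by main() is also printed on your machine. This is a minimal sketch, not part of the original gist; it assumes the file is saved as modal_llama2.py, and the name of the remote-invocation method depends on your Modal version (call() on the older releases this gist targets, remote() on newer ones).

# Optional local entrypoint (sketch, not in the original gist).
# With exactly one local entrypoint defined, `modal run modal_llama2.py` should pick it up,
# invoke main() remotely in the A100 container, and print the returned summary locally.
@stub.local_entrypoint()
def run():
    summary = main.call()  # on newer Modal versions this is main.remote()
    print("Summary from remote run:", summary)

The hugging-face secret referenced by Secret.from_name must expose HUGGINGFACE_TOKEN; it can be created in the Modal dashboard or with the modal secret create CLI command before the first run.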