@priamai
Created November 30, 2023 11:45
from modal import Image
import modal


def download_model():
    # Download the model weights at image-build time so containers start warm
    from transformers import AutoTokenizer, AutoModelForCausalLM

    tokenizer = AutoTokenizer.from_pretrained("garage-bAInd/Platypus2-70B-instruct")
    model = AutoModelForCausalLM.from_pretrained("garage-bAInd/Platypus2-70B-instruct")
    return tokenizer, model


gpu_image = Image.debian_slim().pip_install("airllm")
# Image methods return a new image, so reassign to keep the downloaded weights
gpu_image = gpu_image.run_function(download_model)
stub = modal.Stub(name="airllm-test", gpu_image=gpu_image)

if stub.is_inside(stub.gpu_image):
    # Heavy imports and model setup only run inside the container image
    from airllm import AirLLMLlama2
    import subprocess

    MAX_LENGTH = 128
    model = AirLLMLlama2("garage-bAInd/Platypus2-70B-instruct")
@stub.function(gpu="T4", timeout=60 * 20, image=stub.gpu_image)
def generate(input_text=[]):
    import time

    # Print which GPU the container was assigned
    subprocess.run(["nvidia-smi"])

    timestamp1 = time.time()
    input_tokens = model.tokenizer(
        input_text,
        return_tensors="pt",
        return_attention_mask=False,
        truncation=True,
        max_length=MAX_LENGTH,
        padding=True,
    )
    generation_output = model.generate(
        input_tokens["input_ids"].cuda(),
        # max_new_tokens=2,
        use_cache=True,
        return_dict_in_generate=True,
    )
    output = model.tokenizer.decode(generation_output.sequences[0])
    timestamp2 = time.time()

    print(output)
    print("This took %.2f seconds" % (timestamp2 - timestamp1))
@stub.local_entrypoint()
def main(input_text="What is the capital of United States?"):
    try:
        generate.remote([input_text])
    except modal.exception.FunctionTimeoutError as e:
        print(e)
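
With the Modal client installed, the script runs end to end from a local machine; modal run exposes the local_entrypoint parameters as CLI flags (the filename airllm_test.py is only an assumed name for this gist):

modal run airllm_test.py --input-text "What is the capital of United States?"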
priamai commented Nov 30, 2023

What is the capital of United States?
Washington, D.C.

This took 456.63 seconds
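
Most of that time is AirLLM streaming the 70B checkpoint through the T4 layer by layer, and each generated token needs another full pass. One way to bound the runtime, sketched under the assumption that short answers are acceptable, is to re-enable the commented-out max_new_tokens argument (a standard transformers generation parameter) inside generate:

generation_output = model.generate(
    input_tokens["input_ids"].cuda(),
    max_new_tokens=20,  # illustrative cap; each new token costs a full layer-by-layer pass
    use_cache=True,
    return_dict_in_generate=True,
)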
