@EdwinB12
Created April 23, 2024 16:34
Llama 3 - Local Deployment
# pip install torch transformers bitsandbytes huggingface_hub accelerate
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

if __name__ == "__main__":
    # Log in to Hugging Face (the Meta-Llama-3 repo is gated, so an access token is required)
    login("<INSERT HUGGINGFACE TOKEN>")

    # Define model name
    model_name = "meta-llama/Meta-Llama-3-8B"

    # Load tokenizer and model; 4-bit quantisation lets the 8B model fit on a single consumer GPU
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=BitsAndBytesConfig(load_in_4bit=True),
        device_map="cuda",
    )

    # Define prompt
    prompt = ["Python is the best computing language because"]

    # Tokenise the prompt and move the input ids to the same device as the model
    input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
    input_ids = input_ids.to(model.device)

    # Generate a response with sampling
    response = model.generate(
        input_ids,
        max_new_tokens=256,
        do_sample=True,
        temperature=1.2,
        top_p=0.9,
    )

    # Decode the response back into text
    output = tokenizer.decode(response[0], skip_special_tokens=True)
    print(output)
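
For reference, a minimal sketch (not part of the original gist) of the same local deployment using the transformers text-generation pipeline, which wraps tokenisation, generation and decoding in one call. It assumes the same gated model, a prior huggingface_hub login, and the same sampling parameters as above.

# Assumed alternative: text-generation pipeline with 4-bit quantisation.
# Requires the same `login(...)` call as the script above before loading the model.
from transformers import BitsAndBytesConfig, pipeline

generator = pipeline(
    "text-generation",
    model="meta-llama/Meta-Llama-3-8B",
    model_kwargs={"quantization_config": BitsAndBytesConfig(load_in_4bit=True)},
    device_map="cuda",
)
result = generator(
    "Python is the best computing language because",
    max_new_tokens=256,
    do_sample=True,
    temperature=1.2,
    top_p=0.9,
)
print(result[0]["generated_text"])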