Llama 3 - Local Deployment
# pip install torch transformers bitsandbytes huggingface_hub accelerate
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

if __name__ == "__main__":
    # Log in to Hugging Face (a token with access to the gated Llama 3 repo is required)
    login("<INSERT HUGGINGFACE TOKEN>")

    # Define model name
    model_name = "meta-llama/Meta-Llama-3-8B"

    # Load tokenizer and model, quantized to 4 bits so it fits on a single consumer GPU
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=BitsAndBytesConfig(load_in_4bit=True),
        device_map="cuda",
    )

    # Define prompt
    prompt = ["Python is the best computing language because"]

    # Tokenize the prompt and move the input ids onto the same device as the model
    input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
    input_ids = input_ids.to(model.device)

    # Generate a response with sampling
    response = model.generate(
        input_ids,
        max_new_tokens=256,
        do_sample=True,
        temperature=1.2,
        top_p=0.9,
    )

    # Decode the response back into text
    output = tokenizer.decode(response[0], skip_special_tokens=True)
    print(output)
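For quick experiments, the same local generation can also be done through the transformers pipeline API. This is a minimal alternative sketch, not part of the original script: it reuses the same model name and sampling parameters, but loads the weights in bfloat16 rather than 4-bit quantization to keep the example self-contained (swap in the quantization_config above via model_kwargs if GPU memory is tight).

import torch
from transformers import pipeline

# Build a text-generation pipeline around the same base model
pipe = pipeline(
    "text-generation",
    model="meta-llama/Meta-Llama-3-8B",
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="cuda",
)

# Same prompt and sampling settings as the script above
result = pipe(
    "Python is the best computing language because",
    max_new_tokens=256,
    do_sample=True,
    temperature=1.2,
    top_p=0.9,
)
print(result[0]["generated_text"])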