@EdwinB12
Created April 23, 2024 16:34
Llama 3 - Local Deployment
# pip install torch transformers bitsandbytes huggingface_hub accelerate
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

if __name__ == "__main__":
    # Log in to Hugging Face (the Meta-Llama-3 repo is gated, so an access token is required)
    login("<INSERT HUGGINGFACE TOKEN>")

    # Define model name
    model_name = "meta-llama/Meta-Llama-3-8B"

    # Load tokenizer and model; 4-bit quantisation lets the 8B model fit on a single consumer GPU
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=BitsAndBytesConfig(load_in_4bit=True),
        device_map="cuda",
    )

    # Define prompt
    prompt = ["Python is the best computing language because"]

    # Tokenise the prompt and move the input ids to the same device as the model
    input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
    input_ids = input_ids.to(model.device)

    # Generate a response with sampling
    response = model.generate(
        input_ids,
        max_new_tokens=256,
        do_sample=True,
        temperature=1.2,
        top_p=0.9,
    )

    # Decode the response back into text
    output = tokenizer.decode(response[0], skip_special_tokens=True)
    print(output)
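
For reference, a minimal sketch (not part of the original gist) of the same local deployment using the transformers text-generation pipeline, which wraps tokenisation, generation and decoding in one call. It assumes the same gated model, a prior huggingface_hub login, and the same sampling parameters as above.

# Assumed alternative: text-generation pipeline with 4-bit quantisation.
# Requires the same `login(...)` call as the script above before loading the model.
from transformers import BitsAndBytesConfig, pipeline

generator = pipeline(
    "text-generation",
    model="meta-llama/Meta-Llama-3-8B",
    model_kwargs={"quantization_config": BitsAndBytesConfig(load_in_4bit=True)},
    device_map="cuda",
)
result = generator(
    "Python is the best computing language because",
    max_new_tokens=256,
    do_sample=True,
    temperature=1.2,
    top_p=0.9,
)
print(result[0]["generated_text"])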