@vuiseng9
Created April 12, 2024 04:28
import intel_extension_for_pytorch # required for XPU
import torch
from bigdl.llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer, pipeline

# model_id = "facebook/opt-1.3b"
# model_id = "meta-llama/Llama-2-7b"
model_id = "meta-llama/Llama-2-7b-chat-hf"
prompt = "I love the Avengers,"

# load the Hugging Face Transformers model; the commented-out call applies bigdl-llm INT4 optimizations,
# the active call loads float16 weights onto the XPU
# model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="xpu", torch_dtype=torch.float16)

tokenizer = AutoTokenizer.from_pretrained(model_id) # tokenizer doesn't support .to; only tensors move to the XPU
model = model.to('xpu')
input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
output_ids = model.generate(input_ids)
output = tokenizer.batch_decode(output_ids.cpu())
print(output)


print("joto")
@vuiseng9 (Author) commented:
import intel_extension_for_pytorch # required for XPU

from bigdl.llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer, pipeline

model_id = "facebook/opt-1.3b"
prompt = "I love the Avengers,"

# load Hugging Face Transformers model with INT4 optimizations
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained(model_id) # tokenizer doesn't support .to; only tensors move to the XPU
</tokenizer>
model = model.to('xpu')
input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
output_ids = model.generate(input_ids)
output = tokenizer.batch_decode(output_ids.cpu())
print(output)
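The pipeline import above is never used; one way it could be wired up with the already-loaded model and tokenizer is sketched below. Passing the bigdl-llm model object straight into transformers.pipeline, and the max_new_tokens value, are assumptions for illustration rather than something the gist demonstrates:

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)  # reuses the XPU-resident model
result = generator(prompt, max_new_tokens=32)  # assumed token budget
print(result[0]["generated_text"])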
