Skip to content

Instantly share code, notes, and snippets.

@toranb
Created April 20, 2024 15:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save toranb/3704cb09f8b6aa3b2195c8f2c19e8af3 to your computer and use it in GitHub Desktop.
Save toranb/3704cb09f8b6aa3b2195c8f2c19e8af3 to your computer and use it in GitHub Desktop.
Bumblebee hack to run Llama 3
# Builds a streaming text-generation setup for Meta-Llama-3-8B-Instruct.
#
# Loads the model (bf16, EXLA backend on the :cuda client), the tokenizer and
# the generation config from the Hugging Face repository, overrides the
# tokenizer's special tokens, and returns whatever
# `Bumblebee.Text.generation/4` returns (presumably an `Nx.Serving` - confirm
# against the Bumblebee version in use).
#
# The Hugging Face access token is read from the `HF_TOKEN` environment
# variable so that a real credential does not have to be committed to source.
# When the variable is unset the original placeholder value is used, which
# keeps the previous behaviour.
#
# Raises `MatchError` if any `Bumblebee.load_*` call does not return
# `{:ok, _}`.
def llama() do
  auth_token = System.get_env("HF_TOKEN", "abc123")
  repo = {:hf, "meta-llama/Meta-Llama-3-8B-Instruct", auth_token: auth_token}

  {:ok, model_info} =
    Bumblebee.load_model(repo, type: :bf16, backend: {EXLA.Backend, client: :cuda})

  {:ok, tokenizer} = Bumblebee.load_tokenizer(repo)
  {:ok, generation_config} = Bumblebee.load_generation_config(repo)

  # Replace the special-token map wholesale. Map-update syntax is used so a
  # missing `:special_tokens` key raises instead of silently adding a new key.
  # NOTE(review): presumably needed because the loaded tokenizer does not carry
  # the Llama 3 special tokens - confirm before removing.
  tokenizer = %{
    tokenizer
    | special_tokens: %{
        pad: "<|eot_id|>",
        bos: "<|begin_of_text|>",
        eos: "<|eot_id|>",
        # sep: "<|end_of_text|>",
        unk: "<unk>"
      }
  }

  generation_config =
    Bumblebee.configure(generation_config,
      temperature: 0.9,
      max_new_tokens: 450,
      strategy: %{type: :multinomial_sampling, top_p: 0.9, top_k: 48}
    )

  Bumblebee.Text.generation(model_info, tokenizer, generation_config,
    stream: true,
    compile: [batch_size: 1, sequence_length: [1024, 2048]],
    defn_options: [compiler: EXLA]
  )
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment