toranb/llama_three.ex

## llama_three.ex
  # Builds a streaming text-generation serving for Llama 3 8B Instruct.
  #
  # Loads the model (requested with `type: :bf16` on the EXLA CUDA backend),
  # the tokenizer and the generation config from the Hugging Face repository,
  # overrides the tokenizer's special tokens, and returns whatever
  # `Bumblebee.Text.generation/4` returns.
  #
  # Raises `MatchError` if any loader returns something other than `{:ok, _}`.
  def llama() do
    # NOTE(review): "abc123" looks like a placeholder token; a real credential
    # should come from configuration rather than being committed to source.
    llama = {:hf, "meta-llama/Meta-Llama-3-8B-Instruct", auth_token: "abc123"}

    {:ok, model_info} =
      Bumblebee.load_model(llama, type: :bf16, backend: {EXLA.Backend, client: :cuda})

    {:ok, tokenizer} = Bumblebee.load_tokenizer(llama)
    {:ok, generation_config} = Bumblebee.load_generation_config(llama)

    # Struct update syntax (instead of Map.put/3) raises if :special_tokens is
    # not a field of the tokenizer, so a typo cannot silently add a stray key.
    # Padding reuses the same token as end-of-sequence.
    tokenizer = %{
      tokenizer
      | special_tokens: %{
          pad: "<|eot_id|>",
          bos: "<|begin_of_text|>",
          eos: "<|eot_id|>",
          # sep: "<|end_of_text|>",
          unk: "<unk>"
        }
    }

    generation_config =
      Bumblebee.configure(generation_config,
        temperature: 0.9,
        max_new_tokens: 450,
        strategy: %{type: :multinomial_sampling, top_p: 0.9, top_k: 48}
      )

    # Compiled for batch size 1 and for sequence lengths 1024 and 2048;
    # `stream: true` requests streamed output.
    Bumblebee.Text.generation(model_info, tokenizer, generation_config,
      stream: true,
      compile: [batch_size: 1, sequence_length: [1024, 2048]],
      defn_options: [compiler: EXLA]
    )
  end
	# Builds a streaming text-generation serving for Llama 3 8B Instruct.
	#
	# Loads the model (requested with `type: :bf16` on the EXLA CUDA backend),
	# the tokenizer and the generation config from the Hugging Face repository,
	# overrides the tokenizer's special tokens, and returns whatever
	# `Bumblebee.Text.generation/4` returns.
	#
	# Raises `MatchError` if any loader returns something other than `{:ok, _}`.
	#
	# NOTE(review): this repeats the `llama/0` definition that appears earlier in
	# this file; presumably one of the two copies should be removed. Confirm.
	def llama() do
	  # NOTE(review): "abc123" looks like a placeholder token; a real credential
	  # should come from configuration rather than being committed to source.
	  llama = {:hf, "meta-llama/Meta-Llama-3-8B-Instruct", auth_token: "abc123"}

	  {:ok, model_info} =
	    Bumblebee.load_model(llama, type: :bf16, backend: {EXLA.Backend, client: :cuda})

	  {:ok, tokenizer} = Bumblebee.load_tokenizer(llama)
	  {:ok, generation_config} = Bumblebee.load_generation_config(llama)

	  # Struct update syntax raises if :special_tokens is not a field of the
	  # tokenizer, so a typo cannot silently add a stray key.
	  # Padding reuses the same token as end-of-sequence.
	  tokenizer = %{
	    tokenizer
	    | special_tokens: %{
	        pad: "<|eot_id|>",
	        bos: "<|begin_of_text|>",
	        eos: "<|eot_id|>",
	        # sep: "<|end_of_text|>",
	        unk: "<unk>"
	      }
	  }

	  generation_config =
	    Bumblebee.configure(generation_config,
	      temperature: 0.9,
	      max_new_tokens: 450,
	      strategy: %{type: :multinomial_sampling, top_p: 0.9, top_k: 48}
	    )

	  # Compiled for batch size 1 and for sequence lengths 1024 and 2048;
	  # `stream: true` requests streamed output.
	  Bumblebee.Text.generation(model_info, tokenizer, generation_config,
	    stream: true,
	    compile: [batch_size: 1, sequence_length: [1024, 2048]],
	    defn_options: [compiler: EXLA]
	  )
	end