@bigsnarfdude
Created February 16, 2024 14:42
llama-cpp-python using the Metal GPU backend, plain completion (no Q&A stop tokens)
from llama_cpp import Llama

llm = Llama(
    model_path="/Users/vincent/development/llama.cpp/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    n_gpu_layers=-1,  # Offload all layers to the GPU (Metal on Apple silicon)
    seed=1337,        # Fix the RNG seed for reproducible sampling
    n_ctx=2048,       # Context window size in tokens
)

# Generate a completion; create_completion() can also be called directly
output = llm(
    "[INST] Question: Write a paper on the industrial revolution. Answer: [/INST]",  # Prompt
    max_tokens=None,       # None generates until the end of the context window
    # stop=["Q:", "\n"],   # Uncomment to stop just before the model starts a new question
    echo=True,             # Echo the prompt back in the output
)
print(output["choices"][0]["text"])
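For reference, the completion call returns an OpenAI-style dict. A minimal sketch of its shape, with made-up placeholder values rather than real model output:

```python
# Hypothetical shape of the completion dict returned by llm(...).
# The OpenAI-style keys ("choices", "usage", ...) follow the library's
# completion format; the values below are illustrative placeholders.
example_output = {
    "id": "cmpl-xxxxxxxx",
    "object": "text_completion",
    "model": "mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    "choices": [
        {
            "text": "[INST] Question: ... [/INST] The Industrial Revolution ...",
            "index": 0,
            "logprobs": None,
            "finish_reason": "stop",
        }
    ],
    "usage": {"prompt_tokens": 24, "completion_tokens": 512, "total_tokens": 536},
}

# The script above extracts the generated text the same way:
print(example_output["choices"][0]["text"])
```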