LlamaCPP sample - GRDN.AI
# import libraries
from llama_index import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    ServiceContext,
)
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)
# model_url points at a GGML build that could be downloaded instead of using a
# local file; it is unused below because model_path is supplied. Note that
# recent llama-cpp-python releases only load GGUF models, so this GGML URL is
# only usable with older versions.
model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q4_0.bin"
model_path = "/Users/dheym/Library/CloudStorage/OneDrive-Personal/Documents/side_projects/GRDN/src/models/llama-2-7b-chat.Q4_K_M.gguf"
# Initialize LlamaCPP with a local model for natural language processing
llm = LlamaCPP(
    # Specify the local path to your pre-downloaded Llama model
    model_path=model_path,
    # Set temperature for controlled randomness in generation (0.1 for more deterministic output)
    temperature=0.1,
    # Limit the number of new tokens to generate (1000 tokens for extensive responses)
    max_new_tokens=1000,
    # Define the context window size for the model (set below the model's max to avoid token limit issues)
    context_window=3000,
    # Additional arguments for model generation can be passed here if needed
    generate_kwargs={},
    # Model initialization arguments, including GPU layer settings (may need adjusting based on hardware)
    model_kwargs={"n_gpu_layers": 1},  # For M2 Max, confirm optimal settings from documentation
    # Functions to format prompts and completions for Llama model compatibility
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    # Enable verbose logging for detailed output (useful for development and debugging)
    verbose=True,
)
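# A hedged alternative (not in the original gist): if model_path is omitted,
# LlamaCPP can instead download the weights from model_url on first run.
# Sketch only; assumes the legacy (pre-0.10) llama_index API used above and a
# llama-cpp-python build that can load the referenced model format.
# llm = LlamaCPP(
#     model_url=model_url,
#     temperature=0.1,
#     max_new_tokens=1000,
#     context_window=3000,
#     messages_to_prompt=messages_to_prompt,
#     completion_to_prompt=completion_to_prompt,
#     verbose=True,
# )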
response = llm.complete(
    "Hello! tell me 3 short, concise bullet points about companion planting."
)
print(response.text)
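The SimpleDirectoryReader, VectorStoreIndex, and ServiceContext imports above are unused in this snippet. Below is a minimal sketch of how they could wire the same LlamaCPP instance into a retrieval pipeline, assuming a hypothetical ./data directory of documents, a local embedding model (requires sentence-transformers), and the legacy pre-0.10 llama_index API shown above:

# Sketch (not in the original gist): index local documents and query them
# with the LlamaCPP instance configured above.
service_context = ServiceContext.from_defaults(llm=llm, embed_model="local")
documents = SimpleDirectoryReader("./data").load_data()
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine()
print(query_engine.query("Which plants grow well next to tomatoes?"))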