conda create -n llama2 python=3.9
conda activate llama2
# langchain
pip install langchain
# llama-cpp-python
FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir
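If the build succeeds, a quick sanity check is to import the package and print its version (a minimal sketch; the version string will vary with your install):

import llama_cpp
print(llama_cpp.__version__)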
Below are some links to 4-bit quantized versions of the Llama 2 models.
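Many 4-bit GGUF builds of Llama 2 are published on the Hugging Face Hub. As an illustrative sketch (assuming huggingface_hub is installed; the repo_id and filename below are examples, substitute the quantized build you actually want):

from huggingface_hub import hf_hub_download

# Download a 4-bit quantized chat model from the Hub and get its local path
model_path = hf_hub_download(
    repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
    filename="llama-2-7b-chat.Q4_K_M.gguf",
)
print(model_path)  # pass this path as model_path to LlamaCpp below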
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Stream generated tokens to stdout as they are produced
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

llm = LlamaCpp(
    model_path="{MODEL_FILE_PATH}",  # replace with the path to your GGUF model file
    n_gpu_layers=0,       # 0 = run fully on CPU; raise to offload layers to a GPU build
    temperature=0.1,
    top_p=0.95,
    repeat_penalty=1.0,   # note: LangChain's LlamaCpp parameter is repeat_penalty
    f16_kv=True,          # use float16 for the key/value cache
    callback_manager=callback_manager,
    verbose=True,
)
prompt_template = """\ | |
You are a helpful assistant. | |
You do not respond as 'User' or pretend to be 'User'. | |
You only respond once as Assistant. | |
User: {query} | |
""" | |
def llama2(query):
    # Fill the template with the user's query and return the model's reply
    prompt = prompt_template.format(query=query)
    response = llm(prompt)
    return response

# use this function to get a response from the Llama 2 model
llama2("Hello")