@bigsnarfdude
Last active February 15, 2024 19:21
local_llm.py using ollama on mac mps metal
'''
Self-host an inference server with basic RAG support:
- get a Mac Mini or Mac Studio
- run `ollama serve`
- run ollama web-ui in Docker
- add a coding assistant model from ollamahub through the web-ui
- upload your documents in the web-ui

No code needed: you now have a self-hosted LLM with basic RAG that answers with your
documents in context. For us, the deepseek-coder 33b model is fast enough on a
Mac Studio with 64 GB RAM and gives pretty good suggestions based on our internal
coding documentation.
'''
from openai import OpenAI

# Point the OpenAI client at the local Ollama server's OpenAI-compatible API.
client = OpenAI(
    base_url='http://localhost:11434/v1',
    api_key='ollama',  # required by the client, but unused by Ollama
)

# Standard multi-turn chat completion request against a locally served model.
response = client.chat.completions.create(
    model="llama2",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Who won the world series in 2020?"},
        {"role": "assistant", "content": "The LA Dodgers won in 2020."},
        {"role": "user", "content": "Where was it played?"}
    ]
)
print(response.choices[0].message.content)
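
# ---------------------------------------------------------------------------
# Minimal "documents in context" sketch (an addition, not part of the original
# gist): the same chat endpoint can do basic RAG by pasting document text into
# the prompt. The file path, question, and model name below are assumptions;
# swap in your own document and whichever model you have pulled with
# `ollama pull` (e.g. a deepseek-coder tag).
# ---------------------------------------------------------------------------
from pathlib import Path

doc_text = Path("internal_coding_guidelines.txt").read_text()  # hypothetical file

rag_response = client.chat.completions.create(
    model="llama2",
    messages=[
        {
            "role": "system",
            "content": (
                "Answer using only the documentation below.\n\n"
                "--- DOCUMENTATION ---\n" + doc_text
            ),
        },
        {"role": "user", "content": "What is our naming convention for modules?"},
    ],
)
print(rag_response.choices[0].message.content)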