/*
# Ask Local LLM
*/
// Name: Ask Local LLM
// Author: Zenyr
// Twitter: @zenyr
import "@johnlindquist/kit";
const useGroq = false;
const useLlamaCpp = true;
const apiKey = await env("GROQ_API_KEY");
const baseURL = "http://studio.zenyr.net:2145/v1/";
const modelName = await (
  await fetch(`${baseURL.replace("v1/", "")}health?include_slots`)
).json();
if (useLlamaCpp) {
  const configs = {
    stream: true,
    temperature: 0.1,
    repeat_penalty: 1.05,
    cache_prompt: true,
    system_prompt: {
      prompt: `Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.`,
    },
    // stop: ["</s>", "<|im_end|>", "<|end|>", "<|assistant|>", "<|eot_id|>"],
  };
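  // Async generator that POSTs the prompt to llama.cpp's /completion
  // endpoint and yields content tokens as they arrive over the SSE stream.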
  async function* llama(prompt: string): AsyncGenerator<string> {
    const controller = new AbortController();
    const params = { prompt, ...configs };
    let generation_settings: unknown;
    const response = await fetch(`${baseURL.replace("v1/", "")}completion`, {
      method: "POST",
      body: JSON.stringify(params),
      headers: {
        Connection: "keep-alive",
        "Content-Type": "application/json",
        Accept: "text/event-stream",
      },
      signal: controller.signal,
    });
    const reader = response.body.getReader();
    const decoder = new TextDecoder();
    let content = "";
    let leftover = ""; // Buffer for partially read lines
    try {
      let cont = true;
      while (cont) {
        const result = await reader.read();
        if (result.done) {
          break;
        }
        // Add any leftover data to the current chunk of data
        const text = leftover + decoder.decode(result.value);
        // Check if the last character is a line break
        const endsWithLineBreak = text.endsWith("\n");
        // Split the text into lines
        let lines = text.split("\n");
        // If the text doesn't end with a line break, then the last line is incomplete
        // Store it in leftover to be added to the next chunk of data
        if (!endsWithLineBreak) {
          leftover = lines.pop();
        } else {
          leftover = ""; // Reset leftover if we have a line break at the end
        }
        // Parse all SSE events and add them to result.
        // Non-global regex: a /g regex shared across exec() calls keeps its
        // lastIndex between lines and can silently skip matches.
        const regex = /^(\S+):\s(.*)$/;
        for (const line of lines) {
          const match = regex.exec(line);
          if (match) {
            const parsed: Record<string, unknown> = {};
            parsed[match[1]] = match[2];
            if (parsed.data) {
              const chunk: {
                content: string;
                stop: boolean;
                generation_settings?: unknown;
              } = JSON.parse(parsed.data as string);
              content += chunk.content;
              yield chunk.content;
              if (chunk.stop) {
                cont = false;
                if (chunk.generation_settings) {
                  generation_settings = chunk.generation_settings;
                }
                break;
              }
            }
            if (parsed.error) {
              const errChunk: {
                message: string;
                code: number;
                type: string;
              } = JSON.parse(parsed.error as string);
              if (errChunk.message.includes("slot unavailable")) {
                // Throw an error to be caught by upstream callers
                throw new Error("slot unavailable");
              } else {
                console.error(
                  `llama.cpp error [${errChunk.code} - ${errChunk.type}]: ${errChunk.message}`
                );
              }
            }
          }
        }
      }
    } catch (e) {
      if (e.name !== "AbortError") {
        console.error("llama error: ", e);
      }
      throw e;
    } finally {
      controller.abort();
    }
    return content;
  }
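  // Script Kit chat UI: each submitted prompt is streamed through llama()
  // and the last chat message is re-rendered as Markdown on every token.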
  let currentMessage = "";
  let controller: AbortController | null = null;
  await chat({
    placeholder: JSON.stringify(modelName),
    shortcuts: [
      {
        name: `Close`,
        key: `${cmd}+w`,
        onPress: () => {
          process.exit();
        },
        bar: "left",
      },
      {
        name: `Continue Script`,
        key: `${cmd}+enter`,
        onPress: () => {
          submit("");
        },
        bar: "right",
      },
    ],
    onEscape: async () => {
      if (controller) controller.abort();
    },
    onSubmit: async (input) => {
      if (!input) return;
      setLoading(true);
      chat.addMessage("");
      controller = new AbortController();
      try {
        currentMessage = "";
        for await (const chunk of llama(input)) {
          // Print the completion returned by the LLM.
          currentMessage += chunk || "";
          const html = md(currentMessage);
          chat.setMessage(-1, html);
          if (controller.signal.aborted) {
            break;
          }
        }
        // await memory.saveContext({ input }, { output: currentMessage });
      } finally {
        setLoading(false);
        controller = null;
      }
    },
  });
} else if (!useGroq) {
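  // LangChain branch: stream from an OpenAI-compatible endpoint and keep
  // conversation history in a BufferMemory between turns.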
await npm("@langchain/openai"); | |
// await npm("@langchain/core/prompts"); | |
// await npm("@langchain/core/runnables"); | |
// await npm("langchain/memory"); | |
const { ChatOpenAI } = await import("@langchain/openai"); | |
const { ChatPromptTemplate, MessagesPlaceholder } = await import( | |
"@langchain/core/prompts" | |
); | |
const { RunnableSequence } = await import("@langchain/core/runnables"); | |
const { BufferMemory } = await import("langchain/memory"); | |
const chatModel = new ChatOpenAI({ | |
// model: "llama3-8b-8192", | |
apiKey, | |
temperature: 0.1, | |
configuration: { baseURL }, | |
stop: ["</s>", "<|im_end|>", "<|end|>", "<|assistant|>", "<|eot_id|>"], | |
}); | |
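  // Prompt template: prior turns are injected via the "history" placeholder,
  // followed by the current user input.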
  const prompt = ChatPromptTemplate.fromMessages([
    // ["system", "You are a world class technical documentation writer."],
    new MessagesPlaceholder("history"),
    ["user", "{input}"],
  ]);
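  // In-process conversation buffer; its "history" memoryKey matches the
  // MessagesPlaceholder above.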
  const memory = new BufferMemory({
    returnMessages: true,
    inputKey: "input",
    outputKey: "output",
    memoryKey: "history",
  });
  await memory.loadMemoryVariables({});
  // const chain = prompt.pipe(chatModel);
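  // The chain first loads memory, then maps { input, history } into the
  // prompt, and finally invokes the chat model.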
  const chain = RunnableSequence.from([
    {
      input: (initialInput) => initialInput.input,
      memory: () => memory.loadMemoryVariables({}),
    },
    {
      input: (previousOutput) => previousOutput.input,
      history: (previousOutput) => previousOutput.memory.history,
    },
    prompt,
    chatModel,
  ]);
  let currentMessage = "";
  let controller: AbortController | null = null;
  await chat({
    placeholder: JSON.stringify(modelName),
    shortcuts: [
      {
        name: `Close`,
        key: `${cmd}+w`,
        onPress: () => {
          process.exit();
        },
        bar: "left",
      },
      {
        name: `Continue Script`,
        key: `${cmd}+enter`,
        onPress: () => {
          submit("");
        },
        bar: "right",
      },
    ],
    onEscape: async () => {
      if (controller) controller.abort();
    },
    onSubmit: async (input) => {
      if (!input) return;
      setLoading(true);
      chat.addMessage("");
      controller = new AbortController();
      try {
        const stream = await chain.stream({ input });
        currentMessage = "";
        for await (const chunk of stream) {
          // Print the completion returned by the LLM.
          currentMessage += chunk.content || "";
          const html = md(currentMessage);
          chat.setMessage(-1, html);
          if (controller.signal.aborted) {
            break;
          }
        }
        await memory.saveContext({ input }, { output: currentMessage });
      } finally {
        setLoading(false);
        controller = null;
      }
    },
  });
} else {
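  // Groq SDK branch: same chat UI, but streaming chat completions through
  // the groq-sdk client.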
await npm("groq-sdk"); | |
const { default: Groq } = await import("groq-sdk"); | |
const groq = new Groq({ apiKey, baseURL }); | |
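  // The client is constructed with the custom baseURL above, so requests go
  // to that OpenAI-compatible server rather than Groq's default endpoint.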
  let currentMessage = "";
  let controller: AbortController | null = null;
  await chat({
    shortcuts: [
      {
        name: `Close`,
        key: `${cmd}+w`,
        onPress: () => {
          process.exit();
        },
        bar: "left",
      },
      {
        name: `Continue Script`,
        key: `${cmd}+enter`,
        onPress: () => {
          submit("");
        },
        bar: "right",
      },
    ],
    onEscape: async () => {
      if (controller) controller.abort();
    },
    onSubmit: async (input) => {
      if (!input) return;
      setLoading(true);
      chat.addMessage("");
      controller = new AbortController();
      try {
        const stream = await groq.chat.completions.create(
          {
            stream: true,
            model: "llama3-chatqa-1.5-8b",
            temperature: 0.1,
            messages: [{ role: "user", content: input }],
          },
          { signal: controller.signal }
        );
        currentMessage = "";
        for await (const chunk of stream) {
          // Print the completion returned by the LLM.
          currentMessage += chunk.choices[0]?.delta?.content || "";
          const html = md(currentMessage);
          chat.setMessage(-1, html);
        }
      } finally {
        setLoading(false);
        controller = null;
      }
    },
  });
}