/*
# Ask Local LLM
*/
// Name: Ask Local LLM
// Author: Zenyr
// Twitter: @zenyr
import "@johnlindquist/kit";
const useGroq = false;
const useLlamaCpp = true;
const apiKey = await env("GROQ_API_KEY");
const baseURL = "http://studio.zenyr.net:2145/v1/";
const modelName = await (
  await fetch(`${baseURL.replace("v1/", "")}health?include_slots`)
).json(); // note: this is the llama.cpp /health JSON (server status + slots); it is shown later as the chat placeholder
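// Backend selection: with the flags above, the llama.cpp branch runs. Turning
// useLlamaCpp off (while useGroq stays false) falls through to the
// LangChain/OpenAI-compatible branch; turning useGroq on as well uses the Groq SDK.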
if (useLlamaCpp) {
  const configs = {
    stream: true,
    temperature: 0.1,
    repeat_penalty: 1.05,
    cache_prompt: true,
    system_prompt: {
      prompt: `Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.`,
    },
    // stop: ["</s>", "<|im_end|>", "<|end|>", "<|assistant|>", "<|eot_id|>"],
  };
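  // llama(): streams a completion from llama.cpp's /completion endpoint as
  // server-sent events and yields each content chunk as it arrives. The
  // generator aborts its own fetch in the finally block, so breaking out of a
  // for-await loop over it also cancels the underlying request.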
  async function* llama(prompt: string): AsyncGenerator<string> {
    const controller = new AbortController();
    const params = { prompt, ...configs };
    let generation_settings: unknown;
    const response = await fetch(`${baseURL.replace("v1/", "")}completion`, {
      method: "POST",
      body: JSON.stringify(params),
      headers: {
        Connection: "keep-alive",
        "Content-Type": "application/json",
        Accept: "text/event-stream",
      },
      signal: controller.signal,
    });
    const reader = response.body.getReader();
    const decoder = new TextDecoder();
    let content = "";
    let leftover = ""; // Buffer for partially read lines
    try {
      let cont = true;
      while (cont) {
        const result = await reader.read();
        if (result.done) {
          break;
        }
        // Add any leftover data to the current chunk of data
        const text = leftover + decoder.decode(result.value);
        // Check if the last character is a line break
        const endsWithLineBreak = text.endsWith("\n");
        // Split the text into lines
        let lines = text.split("\n");
        // If the text doesn't end with a line break, then the last line is incomplete
        // Store it in leftover to be added to the next chunk of data
        if (!endsWithLineBreak) {
          leftover = lines.pop();
        } else {
          leftover = ""; // Reset leftover if we have a line break at the end
        }
        // Parse all SSE events ("key: value" lines) and add them to the result
        const regex = /^(\S+):\s(.*)$/gm;
        for (const line of lines) {
          // Reset lastIndex: the regex is global, so a stale lastIndex carried
          // over from the previous line could otherwise skip a valid match.
          regex.lastIndex = 0;
          const match = regex.exec(line);
          if (match) {
            const parsed: Record<string, unknown> = {};
            parsed[match[1]] = match[2];
            if (parsed.data) {
              const chunk: {
                content: string;
                stop: boolean;
                generation_settings?: unknown;
              } = JSON.parse(parsed.data as string);
              content += chunk.content;
              yield chunk.content;
              if (chunk.stop) {
                cont = false;
                if (chunk.generation_settings) {
                  generation_settings = chunk.generation_settings;
                }
                break;
              }
            }
            if (parsed.error) {
              const errChunk: {
                message: string;
                code: number;
                type: string;
              } = JSON.parse(parsed.error as string);
              if (errChunk.message.includes("slot unavailable")) {
                // Throw an error to be caught by upstream callers
                throw new Error("slot unavailable");
              } else {
                console.error(
                  `llama.cpp error [${errChunk.code} - ${errChunk.type}]: ${errChunk.message}`
                );
              }
            }
          }
        }
      }
    } catch (e) {
      if (e.name !== "AbortError") {
        console.error("llama error: ", e);
      }
      throw e;
    } finally {
      controller.abort();
    }
    return content;
  }
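  // Kit chat UI for the llama.cpp branch. The placeholder shows the /health
  // JSON fetched above, cmd+w exits, cmd+enter submits an empty message so the
  // script can continue, and Escape aborts the in-flight stream: the render
  // loop below breaks on the next chunk, which closes the generator and
  // cancels its fetch.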
  let currentMessage = "";
  let controller: AbortController;
  await chat({
    placeholder: JSON.stringify(modelName),
    shortcuts: [
      {
        name: `Close`,
        key: `${cmd}+w`,
        onPress: () => {
          process.exit();
        },
        bar: "left",
      },
      {
        name: `Continue Script`,
        key: `${cmd}+enter`,
        onPress: () => {
          submit("");
        },
        bar: "right",
      },
    ],
    onEscape: async () => {
      if (controller) controller.abort();
    },
    onSubmit: async (input) => {
      if (!input) return;
      setLoading(true);
      chat.addMessage("");
      controller = new AbortController();
      try {
        currentMessage = "";
        for await (const chunk of llama(input)) {
          // Print the completion returned by the LLM.
          currentMessage += chunk || "";
          const html = md(currentMessage);
          chat.setMessage(-1, html);
          if (controller.signal.aborted) {
            break;
          }
        }
        // await memory.saveContext({ input }, { output: currentMessage });
      } finally {
        setLoading(false);
        controller = null;
      }
    },
  });
} else if (!useGroq) {
  await npm("@langchain/openai");
  // await npm("@langchain/core/prompts");
  // await npm("@langchain/core/runnables");
  // await npm("langchain/memory");
  const { ChatOpenAI } = await import("@langchain/openai");
  const { ChatPromptTemplate, MessagesPlaceholder } = await import(
    "@langchain/core/prompts"
  );
  const { RunnableSequence } = await import("@langchain/core/runnables");
  const { BufferMemory } = await import("langchain/memory");
  const chatModel = new ChatOpenAI({
    // model: "llama3-8b-8192",
    apiKey,
    temperature: 0.1,
    configuration: { baseURL },
    stop: ["</s>", "<|im_end|>", "<|end|>", "<|assistant|>", "<|eot_id|>"],
  });
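  // Conversation memory: a BufferMemory feeds prior turns into the prompt's
  // "history" placeholder via a RunnableSequence, so each request to the
  // OpenAI-compatible endpoint carries the running dialog; onSubmit writes the
  // latest exchange back into memory after streaming finishes.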
  const prompt = ChatPromptTemplate.fromMessages([
    // ["system", "You are a world class technical documentation writer."],
    new MessagesPlaceholder("history"),
    ["user", "{input}"],
  ]);
  const memory = new BufferMemory({
    returnMessages: true,
    inputKey: "input",
    outputKey: "output",
    memoryKey: "history",
  });
  await memory.loadMemoryVariables({});
  // const chain = prompt.pipe(chatModel);
  const chain = RunnableSequence.from([
    {
      input: (initialInput) => initialInput.input,
      memory: () => memory.loadMemoryVariables({}),
    },
    {
      input: (previousOutput) => previousOutput.input,
      history: (previousOutput) => previousOutput.memory.history,
    },
    prompt,
    chatModel,
  ]);
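  // Same chat UI as the llama.cpp branch, but tokens are streamed through the
  // LangChain chain and each finished turn is saved back to BufferMemory.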
  let currentMessage = "";
  let controller: AbortController;
  await chat({
    placeholder: JSON.stringify(modelName),
    shortcuts: [
      {
        name: `Close`,
        key: `${cmd}+w`,
        onPress: () => {
          process.exit();
        },
        bar: "left",
      },
      {
        name: `Continue Script`,
        key: `${cmd}+enter`,
        onPress: () => {
          submit("");
        },
        bar: "right",
      },
    ],
    onEscape: async () => {
      if (controller) controller.abort();
    },
    onSubmit: async (input) => {
      if (!input) return;
      setLoading(true);
      chat.addMessage("");
      controller = new AbortController();
      try {
        const stream = await chain.stream({ input });
        currentMessage = "";
        for await (const chunk of stream) {
          // Print the completion returned by the LLM.
          currentMessage += chunk.content || "";
          const html = md(currentMessage);
          chat.setMessage(-1, html);
          if (controller.signal.aborted) {
            break;
          }
        }
        await memory.saveContext({ input }, { output: currentMessage });
      } finally {
        setLoading(false);
        controller = null;
      }
    },
  });
} else {
  await npm("groq-sdk");
  const { default: Groq } = await import("groq-sdk");
  const groq = new Groq({ apiKey, baseURL });
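  // Groq branch: calls chat.completions.create directly with stream: true and
  // renders the streamed deltas; the AbortController signal is passed as a
  // request option so Escape can cancel the in-flight request.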
  let currentMessage = "";
  let controller: AbortController;
  await chat({
    shortcuts: [
      {
        name: `Close`,
        key: `${cmd}+w`,
        onPress: () => {
          process.exit();
        },
        bar: "left",
      },
      {
        name: `Continue Script`,
        key: `${cmd}+enter`,
        onPress: () => {
          submit("");
        },
        bar: "right",
      },
    ],
    onEscape: async () => {
      if (controller) controller.abort();
    },
    onSubmit: async (input) => {
      if (!input) return;
      setLoading(true);
      chat.addMessage("");
      controller = new AbortController();
      try {
        const stream = await groq.chat.completions.create(
          {
            stream: true,
            model: "llama3-chatqa-1.5-8b",
            temperature: 0.1,
            messages: [{ role: "user", content: input }],
          },
          { signal: controller.signal }
        );
        currentMessage = "";
        for await (const chunk of stream) {
          // Print the completion returned by the LLM.
          currentMessage += chunk.choices[0]?.delta?.content || "";
          const html = md(currentMessage);
          chat.setMessage(-1, html);
        }
      } finally {
        setLoading(false);
        controller = null;
      }
    },
  });
}