/*
# Ask Local LLM
*/
// Name: Ask Local LLM
// Author: Zenyr
// Twitter: @zenyr
import "@johnlindquist/kit";
const useGroq = false;
const useLlamaCpp = true;
const apiKey = await env("GROQ_API_KEY");
const baseURL = "http://studio.zenyr.net:2145/v1/";
const modelName = await (
  await fetch(`${baseURL.replace("v1/", "")}health?include_slots`)
).json(); // note: this is the llama.cpp /health JSON (server status + slots); it is shown later as the chat placeholder
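// Backend selection: with the flags above, the llama.cpp branch runs. Turning
// useLlamaCpp off (while useGroq stays false) falls through to the
// LangChain/OpenAI-compatible branch; turning useGroq on as well uses the Groq SDK.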
if (useLlamaCpp) {
  const configs = {
    stream: true,
    temperature: 0.1,
    repeat_penalty: 1.05,
    cache_prompt: true,
    system_prompt: {
      prompt: `Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.`,
    },
    // stop: ["</s>", "<|im_end|>", "<|end|>", "<|assistant|>", "<|eot_id|>"],
  };
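  // llama(): streams a completion from llama.cpp's /completion endpoint as
  // server-sent events and yields each content chunk as it arrives. The
  // generator aborts its own fetch in the finally block, so breaking out of a
  // for-await loop over it also cancels the underlying request.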
  async function* llama(prompt: string): AsyncGenerator<string> {
    const controller = new AbortController();
    const params = { prompt, ...configs };
    let generation_settings: unknown;
    const response = await fetch(`${baseURL.replace("v1/", "")}completion`, {
      method: "POST",
      body: JSON.stringify(params),
      headers: {
        Connection: "keep-alive",
        "Content-Type": "application/json",
        Accept: "text/event-stream",
      },
      signal: controller.signal,
    });
    const reader = response.body.getReader();
    const decoder = new TextDecoder();
    let content = "";
    let leftover = ""; // Buffer for partially read lines
    try {
      let cont = true;
      while (cont) {
        const result = await reader.read();
        if (result.done) {
          break;
        }
        // Add any leftover data to the current chunk of data
        const text = leftover + decoder.decode(result.value);
        // Check if the last character is a line break
        const endsWithLineBreak = text.endsWith("\n");
        // Split the text into lines
        let lines = text.split("\n");
        // If the text doesn't end with a line break, then the last line is incomplete
        // Store it in leftover to be added to the next chunk of data
        if (!endsWithLineBreak) {
          leftover = lines.pop();
        } else {
          leftover = ""; // Reset leftover if we have a line break at the end
        }
        // Parse all SSE events ("key: value" lines) and add them to the result
        const regex = /^(\S+):\s(.*)$/gm;
        for (const line of lines) {
          // Reset lastIndex: the regex is global, so a stale lastIndex carried
          // over from the previous line could otherwise skip a valid match.
          regex.lastIndex = 0;
          const match = regex.exec(line);
          if (match) {
            const parsed: Record<string, unknown> = {};
            parsed[match[1]] = match[2];
            if (parsed.data) {
              const chunk: {
                content: string;
                stop: boolean;
                generation_settings?: unknown;
              } = JSON.parse(parsed.data as string);
              content += chunk.content;
              yield chunk.content;
              if (chunk.stop) {
                cont = false;
                if (chunk.generation_settings) {
                  generation_settings = chunk.generation_settings;
                }
                break;
              }
            }
            if (parsed.error) {
              const errChunk: {
                message: string;
                code: number;
                type: string;
              } = JSON.parse(parsed.error as string);
              if (errChunk.message.includes("slot unavailable")) {
                // Throw an error to be caught by upstream callers
                throw new Error("slot unavailable");
              } else {
                console.error(
                  `llama.cpp error [${errChunk.code} - ${errChunk.type}]: ${errChunk.message}`
                );
              }
            }
          }
        }
      }
    } catch (e) {
      if (e.name !== "AbortError") {
        console.error("llama error: ", e);
      }
      throw e;
    } finally {
      controller.abort();
    }
    return content;
  }
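  // Kit chat UI for the llama.cpp branch. The placeholder shows the /health
  // JSON fetched above, cmd+w exits, cmd+enter submits an empty message so the
  // script can continue, and Escape aborts the in-flight stream: the render
  // loop below breaks on the next chunk, which closes the generator and
  // cancels its fetch.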
  let currentMessage = "";
  let controller: AbortController;
  await chat({
    placeholder: JSON.stringify(modelName),
    shortcuts: [
      {
        name: `Close`,
        key: `${cmd}+w`,
        onPress: () => {
          process.exit();
        },
        bar: "left",
      },
      {
        name: `Continue Script`,
        key: `${cmd}+enter`,
        onPress: () => {
          submit("");
        },
        bar: "right",
      },
    ],
    onEscape: async () => {
      if (controller) controller.abort();
    },
    onSubmit: async (input) => {
      if (!input) return;
      setLoading(true);
      chat.addMessage("");
      controller = new AbortController();
      try {
        currentMessage = "";
        for await (const chunk of llama(input)) {
          // Print the completion returned by the LLM.
          currentMessage += chunk || "";
          const html = md(currentMessage);
          chat.setMessage(-1, html);
          if (controller.signal.aborted) {
            break;
          }
        }
        // await memory.saveContext({ input }, { output: currentMessage });
      } finally {
        setLoading(false);
        controller = null;
      }
    },
  });
} else if (!useGroq) {
  await npm("@langchain/openai");
  // await npm("@langchain/core/prompts");
  // await npm("@langchain/core/runnables");
  // await npm("langchain/memory");
  const { ChatOpenAI } = await import("@langchain/openai");
  const { ChatPromptTemplate, MessagesPlaceholder } = await import(
    "@langchain/core/prompts"
  );
  const { RunnableSequence } = await import("@langchain/core/runnables");
  const { BufferMemory } = await import("langchain/memory");
  const chatModel = new ChatOpenAI({
    // model: "llama3-8b-8192",
    apiKey,
    temperature: 0.1,
    configuration: { baseURL },
    stop: ["</s>", "<|im_end|>", "<|end|>", "<|assistant|>", "<|eot_id|>"],
  });
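  // Conversation memory: a BufferMemory feeds prior turns into the prompt's
  // "history" placeholder via a RunnableSequence, so each request to the
  // OpenAI-compatible endpoint carries the running dialog; onSubmit writes the
  // latest exchange back into memory after streaming finishes.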
  const prompt = ChatPromptTemplate.fromMessages([
    // ["system", "You are a world class technical documentation writer."],
    new MessagesPlaceholder("history"),
    ["user", "{input}"],
  ]);
  const memory = new BufferMemory({
    returnMessages: true,
    inputKey: "input",
    outputKey: "output",
    memoryKey: "history",
  });
  await memory.loadMemoryVariables({});
  // const chain = prompt.pipe(chatModel);
  const chain = RunnableSequence.from([
    {
      input: (initialInput) => initialInput.input,
      memory: () => memory.loadMemoryVariables({}),
    },
    {
      input: (previousOutput) => previousOutput.input,
      history: (previousOutput) => previousOutput.memory.history,
    },
    prompt,
    chatModel,
  ]);
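  // Same chat UI as the llama.cpp branch, but tokens are streamed through the
  // LangChain chain and each finished turn is saved back to BufferMemory.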
  let currentMessage = "";
  let controller: AbortController;
  await chat({
    placeholder: JSON.stringify(modelName),
    shortcuts: [
      {
        name: `Close`,
        key: `${cmd}+w`,
        onPress: () => {
          process.exit();
        },
        bar: "left",
      },
      {
        name: `Continue Script`,
        key: `${cmd}+enter`,
        onPress: () => {
          submit("");
        },
        bar: "right",
      },
    ],
    onEscape: async () => {
      if (controller) controller.abort();
    },
    onSubmit: async (input) => {
      if (!input) return;
      setLoading(true);
      chat.addMessage("");
      controller = new AbortController();
      try {
        const stream = await chain.stream({ input });
        currentMessage = "";
        for await (const chunk of stream) {
          // Print the completion returned by the LLM.
          currentMessage += chunk.content || "";
          const html = md(currentMessage);
          chat.setMessage(-1, html);
          if (controller.signal.aborted) {
            break;
          }
        }
        await memory.saveContext({ input }, { output: currentMessage });
      } finally {
        setLoading(false);
        controller = null;
      }
    },
  });
} else {
  await npm("groq-sdk");
  const { default: Groq } = await import("groq-sdk");
  const groq = new Groq({ apiKey, baseURL });
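  // Groq branch: calls chat.completions.create directly with stream: true and
  // renders the streamed deltas; the AbortController signal is passed as a
  // request option so Escape can cancel the in-flight request.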
  let currentMessage = "";
  let controller: AbortController;
  await chat({
    shortcuts: [
      {
        name: `Close`,
        key: `${cmd}+w`,
        onPress: () => {
          process.exit();
        },
        bar: "left",
      },
      {
        name: `Continue Script`,
        key: `${cmd}+enter`,
        onPress: () => {
          submit("");
        },
        bar: "right",
      },
    ],
    onEscape: async () => {
      if (controller) controller.abort();
    },
    onSubmit: async (input) => {
      if (!input) return;
      setLoading(true);
      chat.addMessage("");
      controller = new AbortController();
      try {
        const stream = await groq.chat.completions.create(
          {
            stream: true,
            model: "llama3-chatqa-1.5-8b",
            temperature: 0.1,
            messages: [{ role: "user", content: input }],
          },
          { signal: controller.signal }
        );
        currentMessage = "";
        for await (const chunk of stream) {
          // Print the completion returned by the LLM.
          currentMessage += chunk.choices[0]?.delta?.content || "";
          const html = md(currentMessage);
          chat.setMessage(-1, html);
        }
      } finally {
        setLoading(false);
        controller = null;
      }
    },
  });
}