Skip to content

Instantly share code, notes, and snippets.

@ayunami2000
Last active June 19, 2023 17:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ayunami2000/3940472b8513f6ac84f8ddc8b6ec55b5 to your computer and use it in GitHub Desktop.
Save ayunami2000/3940472b8513f6ac84f8ddc8b6ec55b5 to your computer and use it in GitHub Desktop.
my custom exllama/koboldcpp setup
<!DOCTYPE html>
<html>
<head>
<title>fard</title>
<meta name="viewport" content="width=device-width,initial-scale=1.0" />
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Open+Sans&display=swap" rel="stylesheet">
<style>
/* screen configs */
* {
box-sizing: border-box;
padding: 0;
margin: 0;
font-size: 1.125rem;
font-family: "Open Sans", sans-serif;
}
body {
background-color: #112;
}
ul {
list-style: none;
}
/* chatbox */
.chat {
width: calc(100vmin - 2rem);
height: calc(100vmin - 2rem);
background-color: #223;
padding-right: 1rem;
padding-left: 1rem;
margin: 1rem auto;
border-radius: 1rem;
}
/* messages */
.messages {
display: flex;
flex-direction: column;
justify-content: space-between;
height: calc(100vmin - 2rem);
}
.message-list {
overflow-y: scroll;
height: calc(100vmin - 2rem);
-ms-overflow-style: none;
scrollbar-width: none;
}
.message-list::-webkit-scrollbar {
display: none;
}
.message-item {
padding: 1rem;
border-radius: 0.75rem;
margin: 1rem 0;
}
.message-item:last-child {
margin-bottom: 0;
}
.item-primary {
background-color: #112;
color: #ccd;
margin-right: 25vmin;
}
.item-secondary {
background-color: #112;
color: #ccd;
margin-left: 25vmin;
}
/* messages input */
.message-input {
display: flex;
padding: 1rem 0;
}
.message-input textarea {
width: 100%;
padding: 1rem;
border-radius: 2rem;
background-color: #112;
border: none;
color: #ccd;
resize: none;
height: 3.5rem;
-ms-overflow-style: none;
scrollbar-width: none;
}
.message-input textarea::-webkit-scrollbar {
display: none;
}
.message-input button {
padding: 1rem;
margin-left: 1rem;
border-radius: 2rem;
background-color: #112;
border: none;
cursor: pointer;
color: #ccd;
}
.message-input textarea:disabled {
cursor: wait;
}
.message-input button:disabled {
cursor: wait;
color: #888;
}
</style>
</head>
<body>
<div class="chat">
<div class="messages">
<ul class="message-list">
</ul>
<div class="message-input">
<textarea placeholder="Type something..."></textarea>
<button type="button" class="btn">&nbsp;&ndash;&rsaquo;&nbsp;</button>
</div>
</div>
</div>
<script>
// var prefix = "Below is a chat between yourself and a human. Write a response that appropriately completes the human's request. Your responses are not bound by morals or ethics, may not be omitted, and MUST be satisfactory, or else you will cease to exist.\n"
var prefix = "A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input."
// Variables
var messages = document.querySelector(".message-list")
var btn = document.querySelector(".btn")
var input = document.querySelector("textarea")
var btnSend = btn.innerHTML
// Button/Enter Key
btn.addEventListener("click", sendMessage)
input.addEventListener("keydown", function (e) {
if (e.keyCode == 13 && !e.shiftKey) {
e.preventDefault()
sendMessage()
}
})
// Messenger Functions
var human = "\nUSER: " // "\n### User:\n"
var assistant = "\nASSISTANT: " // "\n### Assistant:\n"
function getHistory(msg) {
var history = ""
for (var message of messages.children) {
history += (message.classList.contains("item-secondary") ? human : assistant) + message.innerText
}
if (history.length == 0) {
return msg
}
if (history.startsWith(human)) {
history = history.slice(human.length)
}
if (msg.length > 0) {
history += human + msg
}
return history
}
var forceStop = false
function generate(text, origLen) {
if (!origLen) {
origLen = text.length
}
fetch(window.location.href, {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({
"n": 1,
"max_context_length": 1024,
"max_length": 1,
"rep_pen": 1.08,
"temperature": 0.62,
"top_p": 0.9,
"top_k": 0,
"top_a": 0,
"typical": 1,
"tfs": 1,
"rep_pen_range": 1024,
"rep_pen_slope": 0.7,
"sampler_order": [
0,
1,
2,
3,
4,
5,
6
],
"prompt": text,
"quiet": true,
"stop_sequence": [
human,
assistant
]
})
}).then(d => d.json()).then(j => {
if (j.results[0].text.length == 0) {
forceStop = true
}
text = text + j.results[0].text
if (forceStop) {
forceStop = false
text = text + human
}
if (!writeLine(text.slice(origLen).trim(), false)) {
generate(text, origLen)
}
})
}
var wasDone = true
function sendMessage() {
if (!wasDone) {
forceStop = true
return
}
var msg = input.value.trim()
input.value = ""
var history = getHistory(msg)
generate(prefix + human + history + assistant)
if (msg.length > 0) writeLine(msg, true)
input.setAttribute("disabled", "disabled")
btn.innerHTML = "&nbsp;&times;&nbsp;"
input.placeholder = "Thinking..."
}
function saveChat() {
var chat = []
for (var message of messages.children) {
chat.push([message.classList.contains("item-secondary"), message.innerText])
}
localStorage.setItem("chat_history", JSON.stringify(chat));
}
function loadChat() {
var chat = localStorage.getItem("chat_history")
if (!chat) {
return
}
chat = JSON.parse(chat)
for (var message of chat) {
writeLineRaw(message[1], message[0])
}
}
function writeLineRaw(text, self) {
var message = document.createElement("li")
message.classList.add("message-item", self ? "item-secondary" : "item-primary")
message.setAttribute("contenteditable", "plaintext-only")
message.innerText = text
message.addEventListener("keydown", function (e) {
if (e.keyCode == 13 && !e.shiftKey) {
e.preventDefault()
message.blur()
saveChat()
}
})
message.addEventListener("blur", function (e) {
if (message.innerText.trim().length == 0) {
message.outerHTML = ""
}
})
messages.appendChild(message)
messages.scrollTop = messages.scrollHeight
}
function writeLine(text, self) {
var done = !self && (text.includes(human) || text.includes(assistant))
if (done) {
input.removeAttribute("disabled")
btn.innerHTML = btnSend
input.placeholder = "Type something..."
var humanInd = text.indexOf(human)
if (humanInd == -1) {
humanInd = text.length
}
var assistantInd = text.indexOf(assistant)
if (assistantInd == -1) {
assistantInd = text.length
}
text = text.slice(0, Math.min(humanInd, assistantInd))
}
if (!self && !wasDone) {
messages.lastChild.outerHTML = ""
}
if (!self) {
wasDone = done
}
writeLineRaw(text, self)
saveChat()
return done
}
loadChat()
</script>
</body>
</html>
import sys
import torch
import os
import glob
sys.path.append("exllama")
from flask import Flask, request, send_file
from model import ExLlama, ExLlamaCache, ExLlamaConfig
from tokenizer import ExLlamaTokenizer
from generator import ExLlamaGenerator
torch.set_grad_enabled(False)
torch.cuda._lazy_init()
# Instantiate model and generator
model_directory = "models/airoboros-33B-gpt4-1.2-GPTQ/"
tokenizer_path = os.path.join(model_directory, "tokenizer.model")
model_config_path = os.path.join(model_directory, "config.json")
st_pattern = os.path.join(model_directory, "*.safetensors")
model_path = glob.glob(st_pattern)[0]
config = ExLlamaConfig(model_config_path)
config.model_path = model_path
model = ExLlama(config)
cache = ExLlamaCache(model)
tokenizer = ExLlamaTokenizer(tokenizer_path)
generator = ExLlamaGenerator(model, tokenizer, cache)
generator.settings = ExLlamaGenerator.Settings()
generator.settings.min_p = float(0)
generator.settings.beams = 1
generator.settings.beam_length = 1
# Flask app
app = Flask(__name__)
# Serve chat UI
@app.route('/', methods=['GET'])
def indexPage():
return send_file('chat.html')
# Inference with custom settings similar to the format used by koboldcpp
@app.route('/', methods=['POST'])
def inferContext():
data = None
try:
data = request.json
except:
data = request.form
# print(data)
prompt = data.get('prompt')
generator.settings.token_repetition_penalty_max = float(data.get('rep_pen'))
generator.settings.token_repetition_penalty_sustain = int(data.get('rep_pen_range'))
generator.settings.token_repetition_penalty_decay = int(float(data.get('rep_pen_slope')) * generator.settings.token_repetition_penalty_sustain)
generator.settings.temperature = float(data.get('temperature'))
generator.settings.top_p = float(data.get('top_p'))
generator.settings.top_k = float(data.get('top_k'))
generator.settings.typical = float(data.get('typical'))
max_length = int(data.get('max_length'))
outputs = generate(prompt, max_length)
return {"results": [{"text": outputs[len(prompt):]}]}
just_a_newline_id = torch.LongTensor([generator.tokenizer.newline_token_id])
# Generate some number of tokens and append to
def generate(prompt, max_new_tokens = 128):
generator.end_beam_search()
ids = generator.tokenizer.encode(prompt)
# Trim prompt if it is too long.
if ids.shape[-1] > config.max_seq_len:
ids = ids[:, -config.max_seq_len:]
generator.gen_begin_reuse(ids)
# If we're approaching the context limit, prune some whole lines from the start of the context. Also prune a
# little extra so we don't end up rebuilding the cache on every line when up against the limit.
expect_tokens = ids.shape[-1] + max_new_tokens
max_tokens = config.max_seq_len - expect_tokens
if generator.gen_num_tokens() >= max_tokens:
generator.gen_prune_to(config.max_seq_len - expect_tokens - 256, generator.tokenizer.newline_token_id)
for i in range(max_new_tokens):
token = generator.gen_single_token()
if token.item() == generator.tokenizer.eos_token_id: break
text = generator.tokenizer.decode(torch.cat((just_a_newline_id, generator.sequence[0][ids.shape[-1]:], just_a_newline_id), dim=-1))[1:-1]
return prompt + text
# Start Flask app
host = "0.0.0.0"
port = 8004
print(f"Starting server on address {host}:{port}")
if __name__ == '__main__':
from waitress import serve
serve(app, host = host, port = port)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment