Skip to content

Instantly share code, notes, and snippets.

@bogdad
Created April 15, 2023 13:20
Show Gist options
  • Save bogdad/a0899615ebcf3b61934e9daa983a24de to your computer and use it in GitHub Desktop.
Save bogdad/a0899615ebcf3b61934e9daa983a24de to your computer and use it in GitHub Desktop.
based on benchmark_threads.csv
import subprocess
import matplotlib.pyplot as plt
import re
# Defining the command template
cmd = "./build/bin/main \
--seed 147369852 \
--threads {threads} \
--n_predict 64 \
--model ./models/7B/ggml-model-q4_0.bin \
--top_k 40 \
--top_p 0.9 \
--temp 0.5 \
--repeat_last_n 64 \
--repeat_penalty 1.1 \
-p \"Write a funny joke:\""
# Defining the range of threads to loop over
min_threads = 4
max_threads = 10
step = 2
# Defining the number of runs for each thread cmd evaluation
n_runs = 5
# Initializing the lists to store the results
threads_list = []
token_time_list = []
itms = []
for threads in range(min_threads, max_threads + 1, step):
print(f"Running with {threads} threads...")
token_times = []
for run in range(n_runs):
result = subprocess.run(cmd.format(threads=threads), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
output = result.stdout.decode()
print(output)
# Extracting the token time using regular expression
token_time = float(re.search(r"\s+(\d+\.\d+) ms per token", output).group(1))
print(f"\t {threads} threads | run {run+1}/{n_runs} | current token time {round(token_time, 2)} ms")
token_times.append(token_time)
# Get the average token time for the current number of threads
avg_token_time = sum(token_times) / len(token_times)
token_time_list.append(avg_token_time)
threads_list.append(threads)
itm={}
itm["t"]=threads
itm["at"]=avg_token_time
itms.append(itm)
for itm in itms:
t = itm["t"]
at = itm["at"]
print(f"{t},{at}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment