Skip to content

Instantly share code, notes, and snippets.

@cyyynthia
Last active March 31, 2023 14:19
Show Gist options
Save cyyynthia/43784451936e2a608566c42b0bacceac to your computer and use it in GitHub Desktop.
Quick and dirty script to measure the evolution of performance of LLaMA.cpp
/*!
* BSD Zero Clause License
* Copyright (c) Cynthia Rey
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
* INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
* LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
* OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
* PERFORMANCE OF THIS SOFTWARE.
*/
import { spawn, execSync } from 'child_process'
import { open } from 'fs/promises'
// CLI arguments passed to every ./main run; kept identical across commits so
// the timing numbers are comparable.
const LLAMA_PARAMETERS = [
'-t', '4',
'-p', 'Here is a long story about how programming came to be:',
'-n', '256',
'-c', '1024',
'--top_k', '40',
'--top_p', '0.95',
'--repeat_last_n', '64',
'--repeat_penalty', '1.1',
]
// Oldest commit to benchmark; iteration starts here and walks forward in time.
const LLAMA_START_AT = '2d64715ad475f192a4004a52d134c67ccb6f44ad'
// First commits that require the GGMF / GGJT model file formats, respectively.
const LLAMA_GGMF_SINCE = '074bea2eb1f1349a0118239c4152914aecaa1be4'
const LLAMA_GGJT_SINCE = '78ca9838ee36660a776e97e3391b6fb5dcaacf7f'
// Only commits touching one of these files are benchmarked (see getCommits).
// NOTE(review): assumes the pre-"examples/" layout commits are caught via
// `git log --follow` rename tracking — confirm if older commits are skipped.
const IMPORTANT_FILES = [
'ggml.c', 'ggml.h',
'llama.cpp', 'llama.h',
'examples/common.cpp', 'examples/common.h',
'examples/main/main.cpp',
]
// Path of the quantized 7B model converted to each historical file format.
const FORMATS = {
unversioned: './models/7B/ggml-model-q4_0.unversioned.bin',
ggmf: './models/7B/ggml-model-q4_0.ggmf.bin',
ggjt: './models/7B/ggml-model-q4_0.bin',
}
// Results file: one row per (commit, format) run, header written once up front.
const csv = await open('./result.csv', 'w')
// Await the write so the header is flushed before any result row can be
// appended — the original fire-and-forget call was a floating promise.
await csv.appendFile('commit,format,load time,sample time,prompt eval time,eval time,total time,eval token time,max memory rss (KB)\n')
/**
 * Read the resident set size of a process via `ps`.
 *
 * The original implementation skipped a fixed 6 bytes to jump over the
 * "  RSS\n" header, which silently returns NaN/garbage whenever the header
 * width differs. Parsing the last output line is robust to any header format.
 *
 * @param {number} pid - Process ID to inspect.
 * @returns {number} RSS as reported by `ps -o rss` (KB on Linux), or NaN if
 *   the output could not be parsed.
 */
function getMemoryUsageOf (pid) {
const out = execSync(`ps -p ${pid} -o rss`).toString().trim()
// First line is the column header; the value is on the last line.
return Number(out.split('\n').pop().trim())
}
/**
 * Run ./main once for the given commit/format, sample its memory usage while
 * it runs, parse the timing summary from stderr, and append a CSV row.
 *
 * Resolves on success or on a timeout DNF; rejects when the process cannot be
 * spawned or its output cannot be parsed (the caller logs those as runtime
 * DNFs). The original version only ever resolved: a spawn failure crashed the
 * process with an unhandled 'error' event, and a parse failure threw inside
 * the 'exit' handler, so the caller's catch branch was unreachable.
 *
 * @param {string} commit - Commit hash being benchmarked (for the CSV row).
 * @param {'unversioned'|'ggmf'|'ggjt'} format - Key into FORMATS.
 * @returns {Promise<void>}
 */
async function runLlama (commit, format) {
return new Promise((resolve, reject) => {
const llama = spawn('./main', [ ...LLAMA_PARAMETERS, '-m', FORMATS[format] ])
// Without this, a missing/broken ./main emits an unhandled 'error' event.
llama.on('error', reject)
// Keep used memory information
// Ugly way of doing it but I cba to make it better :D
let usedMemory = 0
const memCheck = setInterval(() => (usedMemory = Math.max(usedMemory, getMemoryUsageOf(llama.pid))), 5e3)
// Kill runs that take longer than 5 minutes and mark them as DNF.
let dnf = false
const to = setTimeout(() => {
dnf = true
llama.kill('SIGKILL')
console.log('Warn: DNF (timeout)')
}, 300e3)
const stderrChunks = []
llama.stderr.on('data', (d) => stderrChunks.push(d))
llama.on('exit', () => {
clearInterval(memCheck)
clearTimeout(to)
// DNF; go next
if (dnf) return resolve()
try {
const stderr = Buffer.concat(stderrChunks).toString()
// Timings appear in a fixed order; consume them one matchAll step at a time.
const timings = stderr.matchAll(/(?:llama_print_timings|main):.*?time =\s+([\d\.]+)/g)
const tokTime = stderr.match(/eval time.*?\(\s+([\d\.]+) ms per run|\/\s+([\d.]+) ms/)
const loadTime = timings.next().value[1]
const sampleTime = timings.next().value[1]
// Older commits did not print a "prompt eval time" line at all.
const promptEvalTime = stderr.includes('prompt eval time') ? timings.next().value[1] : ''
const evalTime = timings.next().value[1]
const totalTime = timings.next().value[1]
// Per-token eval time: format changed over time, hence the two capture groups.
const tokenTime = tokTime?.[1] ?? tokTime?.[2] ?? ''
csv.appendFile(`${commit},${format},${loadTime},${sampleTime},${promptEvalTime},${evalTime},${totalTime},${tokenTime},${usedMemory}\n`)
resolve()
} catch (err) {
// Output format we can't parse — surface it as a runtime DNF to the caller.
reject(err)
}
})
})
}
/**
 * Check out the given commit and rebuild ./main from scratch.
 * Throws (from execSync) if any step fails — e.g. a compile error — which the
 * caller treats as a DNF for that commit.
 *
 * @param {string} commit - Commit hash to build.
 */
function checkoutLlama (commit) {
// Clean first so stale objects from the previous commit never leak into the build.
const steps = ['make clean', `git checkout ${commit}`, 'make -j main']
for (const step of steps) {
execSync(step, { stdio: 'ignore' })
}
}
/**
 * List the commit hashes to benchmark, oldest first.
 *
 * Walks the full history, keeps only commits that touched one of the
 * IMPORTANT_FILES (per-file `git log --follow`), and drops everything older
 * than LLAMA_START_AT.
 *
 * @returns {string[]} Commit hashes in chronological (oldest-first) order.
 */
function getCommits () {
// This function excludes commits which did not modify llama-related files.
// It's not made in the most efficient way, but whatever :D
const history = execSync('git log --pretty=format:"%H"').toString().split('\n')
const relevant = new Set(
IMPORTANT_FILES.flatMap((f) =>
execSync(`git log --pretty=format:"%H" --follow '${f}'`).toString().split('\n')
)
)
// git log is newest-first; slice up to the start commit, then flip to oldest-first.
const sinceStart = history.slice(0, history.indexOf(LLAMA_START_AT) + 1)
return sinceStart.reverse().filter((c) => relevant.has(c))
}
const commits = getCommits()
// Commits before ggmfChangeIdx use the unversioned model file, commits before
// ggjtChangeIdx use GGMF, and everything after uses GGJT.
// NOTE(review): if either marker commit were filtered out of `commits`,
// indexOf would return -1 and every run would fall through to 'ggjt' —
// assumed the markers always touch IMPORTANT_FILES; verify if formats look wrong.
const ggmfChangeIdx = commits.indexOf(LLAMA_GGMF_SINCE)
const ggjtChangeIdx = commits.indexOf(LLAMA_GGJT_SINCE)
for (let i = 0; i < commits.length; i++) {
const commit = commits[i]
// Pick the model format this commit's loader expects.
const format = i < ggmfChangeIdx
? 'unversioned'
: i < ggjtChangeIdx
? 'ggmf'
: 'ggjt'
// i + 1 so progress reads 1/N..N/N instead of the off-by-one 0/N..(N-1)/N.
console.log('Processing commit %s (%d/%d)', commit, i + 1, commits.length)
try {
checkoutLlama(commit)
} catch {
// Build failure on this commit: skip it and move on.
console.log('Warn: DNF (compile error)')
continue
}
try {
await runLlama(commit, format)
} catch {
// Spawn or output-parse failure: skip it and move on.
console.log('Warn: DNF (runtime error)')
continue
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment